diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f3ebf062e12962d4f0a0993f747d1aac5f3f2b0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/impl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/impl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12312115275b285b929467ed8ea3f948e42dcdbf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/impl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..967aa6235e1ef71da6dd21a4854171835ce416eb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc13af1ba147b6b882611c1b90ed59fc8279203c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/__main__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/__main__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82b705491b80c157fbde10299b11dac1d76f2e90 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/__main__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a806a21f7b3e7d175eab7d4634c1fac85d427ff Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/cd.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/cd.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f71e88996bb0494c71ba5e296ce4a96f77a8d91 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/cd.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/constant.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/constant.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30a951e3c4ce158a77c4de2dfba5d63290dcda5b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/constant.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/legacy.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/legacy.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30e9aee02f4f395c04b4345a39e0446d63246bf4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/legacy.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/md.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/md.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..607daa2281e872d441d93fc3e322b2313d497d33 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/md.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/models.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/models.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16f505f1590a32a616d07f0747e152e80afa519c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/models.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa89ef87aa60ed266adf428222e02ada99603058 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/version.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/version.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b30a36c417f0f8f3b4e6444bf00e0cb37f15b25 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/version.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..543a5a4de49d07690e73df778aa580589d0789c6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__init__.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +from .__main__ import cli_detect, query_yes_no + +__all__ = ( + "cli_detect", + "query_yes_no", +) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__main__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..64a290f2f86ff5c9a5c16ef4141bd49e3d9f3c8f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__main__.py @@ -0,0 +1,321 @@ +from __future__ import annotations + +import argparse +import sys +from json import dumps +from os.path import abspath, basename, dirname, join, realpath +from platform import python_version +from unicodedata import unidata_version + +import charset_normalizer.md as md_module +from charset_normalizer import from_fp +from charset_normalizer.models import CliDetectionResult +from charset_normalizer.version import __version__ + + +def query_yes_no(question: str, default: str = "yes") -> bool: + """Ask a yes/no question via input() and return their answer. + + "question" is a string that is presented to the user. + "default" is the presumed answer if the user just hits . + It must be "yes" (the default), "no" or None (meaning + an answer is required of the user). + + The "answer" return value is True for "yes" or False for "no". + + Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input + """ + valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False} + if default is None: + prompt = " [y/n] " + elif default == "yes": + prompt = " [Y/n] " + elif default == "no": + prompt = " [y/N] " + else: + raise ValueError("invalid default answer: '%s'" % default) + + while True: + sys.stdout.write(question + prompt) + choice = input().lower() + if default is not None and choice == "": + return valid[default] + elif choice in valid: + return valid[choice] + else: + sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n") + + +def cli_detect(argv: list[str] | None = None) -> int: + """ + CLI assistant using ARGV and ArgumentParser + :param argv: + :return: 0 if everything is fine, anything else equal trouble + """ + parser = argparse.ArgumentParser( + description="The Real First Universal Charset Detector. " + "Discover originating encoding used on text file. " + "Normalize text to unicode." + ) + + parser.add_argument( + "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed" + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + default=False, + dest="verbose", + help="Display complementary information about file if any. " + "Stdout will contain logs about the detection process.", + ) + parser.add_argument( + "-a", + "--with-alternative", + action="store_true", + default=False, + dest="alternatives", + help="Output complementary possibilities if any. Top-level JSON WILL be a list.", + ) + parser.add_argument( + "-n", + "--normalize", + action="store_true", + default=False, + dest="normalize", + help="Permit to normalize input file. If not set, program does not write anything.", + ) + parser.add_argument( + "-m", + "--minimal", + action="store_true", + default=False, + dest="minimal", + help="Only output the charset detected to STDOUT. Disabling JSON output.", + ) + parser.add_argument( + "-r", + "--replace", + action="store_true", + default=False, + dest="replace", + help="Replace file when trying to normalize it instead of creating a new one.", + ) + parser.add_argument( + "-f", + "--force", + action="store_true", + default=False, + dest="force", + help="Replace file without asking if you are sure, use this flag with caution.", + ) + parser.add_argument( + "-i", + "--no-preemptive", + action="store_true", + default=False, + dest="no_preemptive", + help="Disable looking at a charset declaration to hint the detector.", + ) + parser.add_argument( + "-t", + "--threshold", + action="store", + default=0.2, + type=float, + dest="threshold", + help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.", + ) + parser.add_argument( + "--version", + action="version", + version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format( + __version__, + python_version(), + unidata_version, + "OFF" if md_module.__file__.lower().endswith(".py") else "ON", + ), + help="Show version information and exit.", + ) + + args = parser.parse_args(argv) + + if args.replace is True and args.normalize is False: + if args.files: + for my_file in args.files: + my_file.close() + print("Use --replace in addition of --normalize only.", file=sys.stderr) + return 1 + + if args.force is True and args.replace is False: + if args.files: + for my_file in args.files: + my_file.close() + print("Use --force in addition of --replace only.", file=sys.stderr) + return 1 + + if args.threshold < 0.0 or args.threshold > 1.0: + if args.files: + for my_file in args.files: + my_file.close() + print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr) + return 1 + + x_ = [] + + for my_file in args.files: + matches = from_fp( + my_file, + threshold=args.threshold, + explain=args.verbose, + preemptive_behaviour=args.no_preemptive is False, + ) + + best_guess = matches.best() + + if best_guess is None: + print( + 'Unable to identify originating encoding for "{}". {}'.format( + my_file.name, + ( + "Maybe try increasing maximum amount of chaos." + if args.threshold < 1.0 + else "" + ), + ), + file=sys.stderr, + ) + x_.append( + CliDetectionResult( + abspath(my_file.name), + None, + [], + [], + "Unknown", + [], + False, + 1.0, + 0.0, + None, + True, + ) + ) + else: + x_.append( + CliDetectionResult( + abspath(my_file.name), + best_guess.encoding, + best_guess.encoding_aliases, + [ + cp + for cp in best_guess.could_be_from_charset + if cp != best_guess.encoding + ], + best_guess.language, + best_guess.alphabets, + best_guess.bom, + best_guess.percent_chaos, + best_guess.percent_coherence, + None, + True, + ) + ) + + if len(matches) > 1 and args.alternatives: + for el in matches: + if el != best_guess: + x_.append( + CliDetectionResult( + abspath(my_file.name), + el.encoding, + el.encoding_aliases, + [ + cp + for cp in el.could_be_from_charset + if cp != el.encoding + ], + el.language, + el.alphabets, + el.bom, + el.percent_chaos, + el.percent_coherence, + None, + False, + ) + ) + + if args.normalize is True: + if best_guess.encoding.startswith("utf") is True: + print( + '"{}" file does not need to be normalized, as it already came from unicode.'.format( + my_file.name + ), + file=sys.stderr, + ) + if my_file.closed is False: + my_file.close() + continue + + dir_path = dirname(realpath(my_file.name)) + file_name = basename(realpath(my_file.name)) + + o_: list[str] = file_name.split(".") + + if args.replace is False: + o_.insert(-1, best_guess.encoding) + if my_file.closed is False: + my_file.close() + elif ( + args.force is False + and query_yes_no( + 'Are you sure to normalize "{}" by replacing it ?'.format( + my_file.name + ), + "no", + ) + is False + ): + if my_file.closed is False: + my_file.close() + continue + + try: + x_[0].unicode_path = join(dir_path, ".".join(o_)) + + with open(x_[0].unicode_path, "wb") as fp: + fp.write(best_guess.output()) + except OSError as e: + print(str(e), file=sys.stderr) + if my_file.closed is False: + my_file.close() + return 2 + + if my_file.closed is False: + my_file.close() + + if args.minimal is False: + print( + dumps( + [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__, + ensure_ascii=True, + indent=4, + ) + ) + else: + for my_file in args.files: + print( + ", ".join( + [ + el.encoding or "undefined" + for el in x_ + if el.path == abspath(my_file.name) + ] + ) + ) + + return 0 + + +if __name__ == "__main__": + cli_detect() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d78988ebaa5c921590e5d0c213c125f320463fd2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5dd3f9a811ac46cbccfefb3136d639f894a3ccba Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..303980838117e13a0a98182159e06e5a301082a6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_convertions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_convertions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75ab7e24b13f77cea22f7fc36144749ba2448fe9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_convertions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_inspect.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_inspect.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47feae6d46509d9d21d2f0147a1857a8bc9c905f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_inspect.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_pep440.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_pep440.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a6328099baab22b528fd60fd0b776079bd894a0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_pep440.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3b24d633bb29573b808b5ca62fe3219008f263c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38a116d8967a0437ac571956b769e4512d7cf734 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/test_deprecations.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/test_deprecations.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23e1c6f355b7fd45eb6d0d00e57a07ce5a12e415 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/test_deprecations.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/test_regression.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/test_regression.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffa3b2eda05186bd03d0bcf529f54d1882f8ee79 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/test_regression.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_deprecations.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_deprecations.py new file mode 100644 index 0000000000000000000000000000000000000000..cd4c10832e7e7240175571605a07541f0c188f89 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_deprecations.py @@ -0,0 +1,20 @@ +"""Test deprecation and future warnings. + +""" +import numpy as np +from numpy.testing import assert_warns + + +def test_qr_mode_full_future_warning(): + """Check mode='full' FutureWarning. + + In numpy 1.8 the mode options 'full' and 'economic' in linalg.qr were + deprecated. The release date will probably be sometime in the summer + of 2013. + + """ + a = np.eye(2) + assert_warns(DeprecationWarning, np.linalg.qr, a, mode='full') + assert_warns(DeprecationWarning, np.linalg.qr, a, mode='f') + assert_warns(DeprecationWarning, np.linalg.qr, a, mode='economic') + assert_warns(DeprecationWarning, np.linalg.qr, a, mode='e') diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_linalg.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_linalg.py new file mode 100644 index 0000000000000000000000000000000000000000..5dabdfdf010a336741a1f89af101b36c4f62f5ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_linalg.py @@ -0,0 +1,2198 @@ +""" Test functions for linalg module + +""" +import os +import sys +import itertools +import traceback +import textwrap +import subprocess +import pytest + +import numpy as np +from numpy import array, single, double, csingle, cdouble, dot, identity, matmul +from numpy.core import swapaxes +from numpy import multiply, atleast_2d, inf, asarray +from numpy import linalg +from numpy.linalg import matrix_power, norm, matrix_rank, multi_dot, LinAlgError +from numpy.linalg.linalg import _multi_dot_matrix_chain_order +from numpy.testing import ( + assert_, assert_equal, assert_raises, assert_array_equal, + assert_almost_equal, assert_allclose, suppress_warnings, + assert_raises_regex, HAS_LAPACK64, IS_WASM + ) +try: + import numpy.linalg.lapack_lite +except ImportError: + # May be broken when numpy was built without BLAS/LAPACK present + # If so, ensure we don't break the whole test suite - the `lapack_lite` + # submodule should be removed, it's only used in two tests in this file. + pass + + +def consistent_subclass(out, in_): + # For ndarray subclass input, our output should have the same subclass + # (non-ndarray input gets converted to ndarray). + return type(out) is (type(in_) if isinstance(in_, np.ndarray) + else np.ndarray) + + +old_assert_almost_equal = assert_almost_equal + + +def assert_almost_equal(a, b, single_decimal=6, double_decimal=12, **kw): + if asarray(a).dtype.type in (single, csingle): + decimal = single_decimal + else: + decimal = double_decimal + old_assert_almost_equal(a, b, decimal=decimal, **kw) + + +def get_real_dtype(dtype): + return {single: single, double: double, + csingle: single, cdouble: double}[dtype] + + +def get_complex_dtype(dtype): + return {single: csingle, double: cdouble, + csingle: csingle, cdouble: cdouble}[dtype] + + +def get_rtol(dtype): + # Choose a safe rtol + if dtype in (single, csingle): + return 1e-5 + else: + return 1e-11 + + +# used to categorize tests +all_tags = { + 'square', 'nonsquare', 'hermitian', # mutually exclusive + 'generalized', 'size-0', 'strided' # optional additions +} + + +class LinalgCase: + def __init__(self, name, a, b, tags=set()): + """ + A bundle of arguments to be passed to a test case, with an identifying + name, the operands a and b, and a set of tags to filter the tests + """ + assert_(isinstance(name, str)) + self.name = name + self.a = a + self.b = b + self.tags = frozenset(tags) # prevent shared tags + + def check(self, do): + """ + Run the function `do` on this test case, expanding arguments + """ + do(self.a, self.b, tags=self.tags) + + def __repr__(self): + return f'' + + +def apply_tag(tag, cases): + """ + Add the given tag (a string) to each of the cases (a list of LinalgCase + objects) + """ + assert tag in all_tags, "Invalid tag" + for case in cases: + case.tags = case.tags | {tag} + return cases + + +# +# Base test cases +# + +np.random.seed(1234) + +CASES = [] + +# square test cases +CASES += apply_tag('square', [ + LinalgCase("single", + array([[1., 2.], [3., 4.]], dtype=single), + array([2., 1.], dtype=single)), + LinalgCase("double", + array([[1., 2.], [3., 4.]], dtype=double), + array([2., 1.], dtype=double)), + LinalgCase("double_2", + array([[1., 2.], [3., 4.]], dtype=double), + array([[2., 1., 4.], [3., 4., 6.]], dtype=double)), + LinalgCase("csingle", + array([[1. + 2j, 2 + 3j], [3 + 4j, 4 + 5j]], dtype=csingle), + array([2. + 1j, 1. + 2j], dtype=csingle)), + LinalgCase("cdouble", + array([[1. + 2j, 2 + 3j], [3 + 4j, 4 + 5j]], dtype=cdouble), + array([2. + 1j, 1. + 2j], dtype=cdouble)), + LinalgCase("cdouble_2", + array([[1. + 2j, 2 + 3j], [3 + 4j, 4 + 5j]], dtype=cdouble), + array([[2. + 1j, 1. + 2j, 1 + 3j], [1 - 2j, 1 - 3j, 1 - 6j]], dtype=cdouble)), + LinalgCase("0x0", + np.empty((0, 0), dtype=double), + np.empty((0,), dtype=double), + tags={'size-0'}), + LinalgCase("8x8", + np.random.rand(8, 8), + np.random.rand(8)), + LinalgCase("1x1", + np.random.rand(1, 1), + np.random.rand(1)), + LinalgCase("nonarray", + [[1, 2], [3, 4]], + [2, 1]), +]) + +# non-square test-cases +CASES += apply_tag('nonsquare', [ + LinalgCase("single_nsq_1", + array([[1., 2., 3.], [3., 4., 6.]], dtype=single), + array([2., 1.], dtype=single)), + LinalgCase("single_nsq_2", + array([[1., 2.], [3., 4.], [5., 6.]], dtype=single), + array([2., 1., 3.], dtype=single)), + LinalgCase("double_nsq_1", + array([[1., 2., 3.], [3., 4., 6.]], dtype=double), + array([2., 1.], dtype=double)), + LinalgCase("double_nsq_2", + array([[1., 2.], [3., 4.], [5., 6.]], dtype=double), + array([2., 1., 3.], dtype=double)), + LinalgCase("csingle_nsq_1", + array( + [[1. + 1j, 2. + 2j, 3. - 3j], [3. - 5j, 4. + 9j, 6. + 2j]], dtype=csingle), + array([2. + 1j, 1. + 2j], dtype=csingle)), + LinalgCase("csingle_nsq_2", + array( + [[1. + 1j, 2. + 2j], [3. - 3j, 4. - 9j], [5. - 4j, 6. + 8j]], dtype=csingle), + array([2. + 1j, 1. + 2j, 3. - 3j], dtype=csingle)), + LinalgCase("cdouble_nsq_1", + array( + [[1. + 1j, 2. + 2j, 3. - 3j], [3. - 5j, 4. + 9j, 6. + 2j]], dtype=cdouble), + array([2. + 1j, 1. + 2j], dtype=cdouble)), + LinalgCase("cdouble_nsq_2", + array( + [[1. + 1j, 2. + 2j], [3. - 3j, 4. - 9j], [5. - 4j, 6. + 8j]], dtype=cdouble), + array([2. + 1j, 1. + 2j, 3. - 3j], dtype=cdouble)), + LinalgCase("cdouble_nsq_1_2", + array( + [[1. + 1j, 2. + 2j, 3. - 3j], [3. - 5j, 4. + 9j, 6. + 2j]], dtype=cdouble), + array([[2. + 1j, 1. + 2j], [1 - 1j, 2 - 2j]], dtype=cdouble)), + LinalgCase("cdouble_nsq_2_2", + array( + [[1. + 1j, 2. + 2j], [3. - 3j, 4. - 9j], [5. - 4j, 6. + 8j]], dtype=cdouble), + array([[2. + 1j, 1. + 2j], [1 - 1j, 2 - 2j], [1 - 1j, 2 - 2j]], dtype=cdouble)), + LinalgCase("8x11", + np.random.rand(8, 11), + np.random.rand(8)), + LinalgCase("1x5", + np.random.rand(1, 5), + np.random.rand(1)), + LinalgCase("5x1", + np.random.rand(5, 1), + np.random.rand(5)), + LinalgCase("0x4", + np.random.rand(0, 4), + np.random.rand(0), + tags={'size-0'}), + LinalgCase("4x0", + np.random.rand(4, 0), + np.random.rand(4), + tags={'size-0'}), +]) + +# hermitian test-cases +CASES += apply_tag('hermitian', [ + LinalgCase("hsingle", + array([[1., 2.], [2., 1.]], dtype=single), + None), + LinalgCase("hdouble", + array([[1., 2.], [2., 1.]], dtype=double), + None), + LinalgCase("hcsingle", + array([[1., 2 + 3j], [2 - 3j, 1]], dtype=csingle), + None), + LinalgCase("hcdouble", + array([[1., 2 + 3j], [2 - 3j, 1]], dtype=cdouble), + None), + LinalgCase("hempty", + np.empty((0, 0), dtype=double), + None, + tags={'size-0'}), + LinalgCase("hnonarray", + [[1, 2], [2, 1]], + None), + LinalgCase("matrix_b_only", + array([[1., 2.], [2., 1.]]), + None), + LinalgCase("hmatrix_1x1", + np.random.rand(1, 1), + None), +]) + + +# +# Gufunc test cases +# +def _make_generalized_cases(): + new_cases = [] + + for case in CASES: + if not isinstance(case.a, np.ndarray): + continue + + a = np.array([case.a, 2 * case.a, 3 * case.a]) + if case.b is None: + b = None + else: + b = np.array([case.b, 7 * case.b, 6 * case.b]) + new_case = LinalgCase(case.name + "_tile3", a, b, + tags=case.tags | {'generalized'}) + new_cases.append(new_case) + + a = np.array([case.a] * 2 * 3).reshape((3, 2) + case.a.shape) + if case.b is None: + b = None + else: + b = np.array([case.b] * 2 * 3).reshape((3, 2) + case.b.shape) + new_case = LinalgCase(case.name + "_tile213", a, b, + tags=case.tags | {'generalized'}) + new_cases.append(new_case) + + return new_cases + + +CASES += _make_generalized_cases() + + +# +# Generate stride combination variations of the above +# +def _stride_comb_iter(x): + """ + Generate cartesian product of strides for all axes + """ + + if not isinstance(x, np.ndarray): + yield x, "nop" + return + + stride_set = [(1,)] * x.ndim + stride_set[-1] = (1, 3, -4) + if x.ndim > 1: + stride_set[-2] = (1, 3, -4) + if x.ndim > 2: + stride_set[-3] = (1, -4) + + for repeats in itertools.product(*tuple(stride_set)): + new_shape = [abs(a * b) for a, b in zip(x.shape, repeats)] + slices = tuple([slice(None, None, repeat) for repeat in repeats]) + + # new array with different strides, but same data + xi = np.empty(new_shape, dtype=x.dtype) + xi.view(np.uint32).fill(0xdeadbeef) + xi = xi[slices] + xi[...] = x + xi = xi.view(x.__class__) + assert_(np.all(xi == x)) + yield xi, "stride_" + "_".join(["%+d" % j for j in repeats]) + + # generate also zero strides if possible + if x.ndim >= 1 and x.shape[-1] == 1: + s = list(x.strides) + s[-1] = 0 + xi = np.lib.stride_tricks.as_strided(x, strides=s) + yield xi, "stride_xxx_0" + if x.ndim >= 2 and x.shape[-2] == 1: + s = list(x.strides) + s[-2] = 0 + xi = np.lib.stride_tricks.as_strided(x, strides=s) + yield xi, "stride_xxx_0_x" + if x.ndim >= 2 and x.shape[:-2] == (1, 1): + s = list(x.strides) + s[-1] = 0 + s[-2] = 0 + xi = np.lib.stride_tricks.as_strided(x, strides=s) + yield xi, "stride_xxx_0_0" + + +def _make_strided_cases(): + new_cases = [] + for case in CASES: + for a, a_label in _stride_comb_iter(case.a): + for b, b_label in _stride_comb_iter(case.b): + new_case = LinalgCase(case.name + "_" + a_label + "_" + b_label, a, b, + tags=case.tags | {'strided'}) + new_cases.append(new_case) + return new_cases + + +CASES += _make_strided_cases() + + +# +# Test different routines against the above cases +# +class LinalgTestCase: + TEST_CASES = CASES + + def check_cases(self, require=set(), exclude=set()): + """ + Run func on each of the cases with all of the tags in require, and none + of the tags in exclude + """ + for case in self.TEST_CASES: + # filter by require and exclude + if case.tags & require != require: + continue + if case.tags & exclude: + continue + + try: + case.check(self.do) + except Exception as e: + msg = f'In test case: {case!r}\n\n' + msg += traceback.format_exc() + raise AssertionError(msg) from e + + +class LinalgSquareTestCase(LinalgTestCase): + + def test_sq_cases(self): + self.check_cases(require={'square'}, + exclude={'generalized', 'size-0'}) + + def test_empty_sq_cases(self): + self.check_cases(require={'square', 'size-0'}, + exclude={'generalized'}) + + +class LinalgNonsquareTestCase(LinalgTestCase): + + def test_nonsq_cases(self): + self.check_cases(require={'nonsquare'}, + exclude={'generalized', 'size-0'}) + + def test_empty_nonsq_cases(self): + self.check_cases(require={'nonsquare', 'size-0'}, + exclude={'generalized'}) + + +class HermitianTestCase(LinalgTestCase): + + def test_herm_cases(self): + self.check_cases(require={'hermitian'}, + exclude={'generalized', 'size-0'}) + + def test_empty_herm_cases(self): + self.check_cases(require={'hermitian', 'size-0'}, + exclude={'generalized'}) + + +class LinalgGeneralizedSquareTestCase(LinalgTestCase): + + @pytest.mark.slow + def test_generalized_sq_cases(self): + self.check_cases(require={'generalized', 'square'}, + exclude={'size-0'}) + + @pytest.mark.slow + def test_generalized_empty_sq_cases(self): + self.check_cases(require={'generalized', 'square', 'size-0'}) + + +class LinalgGeneralizedNonsquareTestCase(LinalgTestCase): + + @pytest.mark.slow + def test_generalized_nonsq_cases(self): + self.check_cases(require={'generalized', 'nonsquare'}, + exclude={'size-0'}) + + @pytest.mark.slow + def test_generalized_empty_nonsq_cases(self): + self.check_cases(require={'generalized', 'nonsquare', 'size-0'}) + + +class HermitianGeneralizedTestCase(LinalgTestCase): + + @pytest.mark.slow + def test_generalized_herm_cases(self): + self.check_cases(require={'generalized', 'hermitian'}, + exclude={'size-0'}) + + @pytest.mark.slow + def test_generalized_empty_herm_cases(self): + self.check_cases(require={'generalized', 'hermitian', 'size-0'}, + exclude={'none'}) + + +def dot_generalized(a, b): + a = asarray(a) + if a.ndim >= 3: + if a.ndim == b.ndim: + # matrix x matrix + new_shape = a.shape[:-1] + b.shape[-1:] + elif a.ndim == b.ndim + 1: + # matrix x vector + new_shape = a.shape[:-1] + else: + raise ValueError("Not implemented...") + r = np.empty(new_shape, dtype=np.common_type(a, b)) + for c in itertools.product(*map(range, a.shape[:-2])): + r[c] = dot(a[c], b[c]) + return r + else: + return dot(a, b) + + +def identity_like_generalized(a): + a = asarray(a) + if a.ndim >= 3: + r = np.empty(a.shape, dtype=a.dtype) + r[...] = identity(a.shape[-2]) + return r + else: + return identity(a.shape[0]) + + +class SolveCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): + # kept apart from TestSolve for use for testing with matrices. + def do(self, a, b, tags): + x = linalg.solve(a, b) + assert_almost_equal(b, dot_generalized(a, x)) + assert_(consistent_subclass(x, b)) + + +class TestSolve(SolveCases): + @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble]) + def test_types(self, dtype): + x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) + assert_equal(linalg.solve(x, x).dtype, dtype) + + def test_0_size(self): + class ArraySubclass(np.ndarray): + pass + # Test system of 0x0 matrices + a = np.arange(8).reshape(2, 2, 2) + b = np.arange(6).reshape(1, 2, 3).view(ArraySubclass) + + expected = linalg.solve(a, b)[:, 0:0, :] + result = linalg.solve(a[:, 0:0, 0:0], b[:, 0:0, :]) + assert_array_equal(result, expected) + assert_(isinstance(result, ArraySubclass)) + + # Test errors for non-square and only b's dimension being 0 + assert_raises(linalg.LinAlgError, linalg.solve, a[:, 0:0, 0:1], b) + assert_raises(ValueError, linalg.solve, a, b[:, 0:0, :]) + + # Test broadcasting error + b = np.arange(6).reshape(1, 3, 2) # broadcasting error + assert_raises(ValueError, linalg.solve, a, b) + assert_raises(ValueError, linalg.solve, a[0:0], b[0:0]) + + # Test zero "single equations" with 0x0 matrices. + b = np.arange(2).reshape(1, 2).view(ArraySubclass) + expected = linalg.solve(a, b)[:, 0:0] + result = linalg.solve(a[:, 0:0, 0:0], b[:, 0:0]) + assert_array_equal(result, expected) + assert_(isinstance(result, ArraySubclass)) + + b = np.arange(3).reshape(1, 3) + assert_raises(ValueError, linalg.solve, a, b) + assert_raises(ValueError, linalg.solve, a[0:0], b[0:0]) + assert_raises(ValueError, linalg.solve, a[:, 0:0, 0:0], b) + + def test_0_size_k(self): + # test zero multiple equation (K=0) case. + class ArraySubclass(np.ndarray): + pass + a = np.arange(4).reshape(1, 2, 2) + b = np.arange(6).reshape(3, 2, 1).view(ArraySubclass) + + expected = linalg.solve(a, b)[:, :, 0:0] + result = linalg.solve(a, b[:, :, 0:0]) + assert_array_equal(result, expected) + assert_(isinstance(result, ArraySubclass)) + + # test both zero. + expected = linalg.solve(a, b)[:, 0:0, 0:0] + result = linalg.solve(a[:, 0:0, 0:0], b[:, 0:0, 0:0]) + assert_array_equal(result, expected) + assert_(isinstance(result, ArraySubclass)) + + +class InvCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): + + def do(self, a, b, tags): + a_inv = linalg.inv(a) + assert_almost_equal(dot_generalized(a, a_inv), + identity_like_generalized(a)) + assert_(consistent_subclass(a_inv, a)) + + +class TestInv(InvCases): + @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble]) + def test_types(self, dtype): + x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) + assert_equal(linalg.inv(x).dtype, dtype) + + def test_0_size(self): + # Check that all kinds of 0-sized arrays work + class ArraySubclass(np.ndarray): + pass + a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass) + res = linalg.inv(a) + assert_(res.dtype.type is np.float64) + assert_equal(a.shape, res.shape) + assert_(isinstance(res, ArraySubclass)) + + a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass) + res = linalg.inv(a) + assert_(res.dtype.type is np.complex64) + assert_equal(a.shape, res.shape) + assert_(isinstance(res, ArraySubclass)) + + +class EigvalsCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): + + def do(self, a, b, tags): + ev = linalg.eigvals(a) + evalues, evectors = linalg.eig(a) + assert_almost_equal(ev, evalues) + + +class TestEigvals(EigvalsCases): + @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble]) + def test_types(self, dtype): + x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) + assert_equal(linalg.eigvals(x).dtype, dtype) + x = np.array([[1, 0.5], [-1, 1]], dtype=dtype) + assert_equal(linalg.eigvals(x).dtype, get_complex_dtype(dtype)) + + def test_0_size(self): + # Check that all kinds of 0-sized arrays work + class ArraySubclass(np.ndarray): + pass + a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass) + res = linalg.eigvals(a) + assert_(res.dtype.type is np.float64) + assert_equal((0, 1), res.shape) + # This is just for documentation, it might make sense to change: + assert_(isinstance(res, np.ndarray)) + + a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass) + res = linalg.eigvals(a) + assert_(res.dtype.type is np.complex64) + assert_equal((0,), res.shape) + # This is just for documentation, it might make sense to change: + assert_(isinstance(res, np.ndarray)) + + +class EigCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): + + def do(self, a, b, tags): + res = linalg.eig(a) + eigenvalues, eigenvectors = res.eigenvalues, res.eigenvectors + assert_allclose(dot_generalized(a, eigenvectors), + np.asarray(eigenvectors) * np.asarray(eigenvalues)[..., None, :], + rtol=get_rtol(eigenvalues.dtype)) + assert_(consistent_subclass(eigenvectors, a)) + + +class TestEig(EigCases): + @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble]) + def test_types(self, dtype): + x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) + w, v = np.linalg.eig(x) + assert_equal(w.dtype, dtype) + assert_equal(v.dtype, dtype) + + x = np.array([[1, 0.5], [-1, 1]], dtype=dtype) + w, v = np.linalg.eig(x) + assert_equal(w.dtype, get_complex_dtype(dtype)) + assert_equal(v.dtype, get_complex_dtype(dtype)) + + def test_0_size(self): + # Check that all kinds of 0-sized arrays work + class ArraySubclass(np.ndarray): + pass + a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass) + res, res_v = linalg.eig(a) + assert_(res_v.dtype.type is np.float64) + assert_(res.dtype.type is np.float64) + assert_equal(a.shape, res_v.shape) + assert_equal((0, 1), res.shape) + # This is just for documentation, it might make sense to change: + assert_(isinstance(a, np.ndarray)) + + a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass) + res, res_v = linalg.eig(a) + assert_(res_v.dtype.type is np.complex64) + assert_(res.dtype.type is np.complex64) + assert_equal(a.shape, res_v.shape) + assert_equal((0,), res.shape) + # This is just for documentation, it might make sense to change: + assert_(isinstance(a, np.ndarray)) + + +class SVDBaseTests: + hermitian = False + + @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble]) + def test_types(self, dtype): + x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) + res = linalg.svd(x) + U, S, Vh = res.U, res.S, res.Vh + assert_equal(U.dtype, dtype) + assert_equal(S.dtype, get_real_dtype(dtype)) + assert_equal(Vh.dtype, dtype) + s = linalg.svd(x, compute_uv=False, hermitian=self.hermitian) + assert_equal(s.dtype, get_real_dtype(dtype)) + + +class SVDCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): + + def do(self, a, b, tags): + u, s, vt = linalg.svd(a, False) + assert_allclose(a, dot_generalized(np.asarray(u) * np.asarray(s)[..., None, :], + np.asarray(vt)), + rtol=get_rtol(u.dtype)) + assert_(consistent_subclass(u, a)) + assert_(consistent_subclass(vt, a)) + + +class TestSVD(SVDCases, SVDBaseTests): + def test_empty_identity(self): + """ Empty input should put an identity matrix in u or vh """ + x = np.empty((4, 0)) + u, s, vh = linalg.svd(x, compute_uv=True, hermitian=self.hermitian) + assert_equal(u.shape, (4, 4)) + assert_equal(vh.shape, (0, 0)) + assert_equal(u, np.eye(4)) + + x = np.empty((0, 4)) + u, s, vh = linalg.svd(x, compute_uv=True, hermitian=self.hermitian) + assert_equal(u.shape, (0, 0)) + assert_equal(vh.shape, (4, 4)) + assert_equal(vh, np.eye(4)) + + +class SVDHermitianCases(HermitianTestCase, HermitianGeneralizedTestCase): + + def do(self, a, b, tags): + u, s, vt = linalg.svd(a, False, hermitian=True) + assert_allclose(a, dot_generalized(np.asarray(u) * np.asarray(s)[..., None, :], + np.asarray(vt)), + rtol=get_rtol(u.dtype)) + def hermitian(mat): + axes = list(range(mat.ndim)) + axes[-1], axes[-2] = axes[-2], axes[-1] + return np.conj(np.transpose(mat, axes=axes)) + + assert_almost_equal(np.matmul(u, hermitian(u)), np.broadcast_to(np.eye(u.shape[-1]), u.shape)) + assert_almost_equal(np.matmul(vt, hermitian(vt)), np.broadcast_to(np.eye(vt.shape[-1]), vt.shape)) + assert_equal(np.sort(s)[..., ::-1], s) + assert_(consistent_subclass(u, a)) + assert_(consistent_subclass(vt, a)) + + +class TestSVDHermitian(SVDHermitianCases, SVDBaseTests): + hermitian = True + + +class CondCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): + # cond(x, p) for p in (None, 2, -2) + + def do(self, a, b, tags): + c = asarray(a) # a might be a matrix + if 'size-0' in tags: + assert_raises(LinAlgError, linalg.cond, c) + return + + # +-2 norms + s = linalg.svd(c, compute_uv=False) + assert_almost_equal( + linalg.cond(a), s[..., 0] / s[..., -1], + single_decimal=5, double_decimal=11) + assert_almost_equal( + linalg.cond(a, 2), s[..., 0] / s[..., -1], + single_decimal=5, double_decimal=11) + assert_almost_equal( + linalg.cond(a, -2), s[..., -1] / s[..., 0], + single_decimal=5, double_decimal=11) + + # Other norms + cinv = np.linalg.inv(c) + assert_almost_equal( + linalg.cond(a, 1), + abs(c).sum(-2).max(-1) * abs(cinv).sum(-2).max(-1), + single_decimal=5, double_decimal=11) + assert_almost_equal( + linalg.cond(a, -1), + abs(c).sum(-2).min(-1) * abs(cinv).sum(-2).min(-1), + single_decimal=5, double_decimal=11) + assert_almost_equal( + linalg.cond(a, np.inf), + abs(c).sum(-1).max(-1) * abs(cinv).sum(-1).max(-1), + single_decimal=5, double_decimal=11) + assert_almost_equal( + linalg.cond(a, -np.inf), + abs(c).sum(-1).min(-1) * abs(cinv).sum(-1).min(-1), + single_decimal=5, double_decimal=11) + assert_almost_equal( + linalg.cond(a, 'fro'), + np.sqrt((abs(c)**2).sum(-1).sum(-1) + * (abs(cinv)**2).sum(-1).sum(-1)), + single_decimal=5, double_decimal=11) + + +class TestCond(CondCases): + def test_basic_nonsvd(self): + # Smoketest the non-svd norms + A = array([[1., 0, 1], [0, -2., 0], [0, 0, 3.]]) + assert_almost_equal(linalg.cond(A, inf), 4) + assert_almost_equal(linalg.cond(A, -inf), 2/3) + assert_almost_equal(linalg.cond(A, 1), 4) + assert_almost_equal(linalg.cond(A, -1), 0.5) + assert_almost_equal(linalg.cond(A, 'fro'), np.sqrt(265 / 12)) + + def test_singular(self): + # Singular matrices have infinite condition number for + # positive norms, and negative norms shouldn't raise + # exceptions + As = [np.zeros((2, 2)), np.ones((2, 2))] + p_pos = [None, 1, 2, 'fro'] + p_neg = [-1, -2] + for A, p in itertools.product(As, p_pos): + # Inversion may not hit exact infinity, so just check the + # number is large + assert_(linalg.cond(A, p) > 1e15) + for A, p in itertools.product(As, p_neg): + linalg.cond(A, p) + + @pytest.mark.xfail(True, run=False, + reason="Platform/LAPACK-dependent failure, " + "see gh-18914") + def test_nan(self): + # nans should be passed through, not converted to infs + ps = [None, 1, -1, 2, -2, 'fro'] + p_pos = [None, 1, 2, 'fro'] + + A = np.ones((2, 2)) + A[0,1] = np.nan + for p in ps: + c = linalg.cond(A, p) + assert_(isinstance(c, np.float_)) + assert_(np.isnan(c)) + + A = np.ones((3, 2, 2)) + A[1,0,1] = np.nan + for p in ps: + c = linalg.cond(A, p) + assert_(np.isnan(c[1])) + if p in p_pos: + assert_(c[0] > 1e15) + assert_(c[2] > 1e15) + else: + assert_(not np.isnan(c[0])) + assert_(not np.isnan(c[2])) + + def test_stacked_singular(self): + # Check behavior when only some of the stacked matrices are + # singular + np.random.seed(1234) + A = np.random.rand(2, 2, 2, 2) + A[0,0] = 0 + A[1,1] = 0 + + for p in (None, 1, 2, 'fro', -1, -2): + c = linalg.cond(A, p) + assert_equal(c[0,0], np.inf) + assert_equal(c[1,1], np.inf) + assert_(np.isfinite(c[0,1])) + assert_(np.isfinite(c[1,0])) + + +class PinvCases(LinalgSquareTestCase, + LinalgNonsquareTestCase, + LinalgGeneralizedSquareTestCase, + LinalgGeneralizedNonsquareTestCase): + + def do(self, a, b, tags): + a_ginv = linalg.pinv(a) + # `a @ a_ginv == I` does not hold if a is singular + dot = dot_generalized + assert_almost_equal(dot(dot(a, a_ginv), a), a, single_decimal=5, double_decimal=11) + assert_(consistent_subclass(a_ginv, a)) + + +class TestPinv(PinvCases): + pass + + +class PinvHermitianCases(HermitianTestCase, HermitianGeneralizedTestCase): + + def do(self, a, b, tags): + a_ginv = linalg.pinv(a, hermitian=True) + # `a @ a_ginv == I` does not hold if a is singular + dot = dot_generalized + assert_almost_equal(dot(dot(a, a_ginv), a), a, single_decimal=5, double_decimal=11) + assert_(consistent_subclass(a_ginv, a)) + + +class TestPinvHermitian(PinvHermitianCases): + pass + + +class DetCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase): + + def do(self, a, b, tags): + d = linalg.det(a) + res = linalg.slogdet(a) + s, ld = res.sign, res.logabsdet + if asarray(a).dtype.type in (single, double): + ad = asarray(a).astype(double) + else: + ad = asarray(a).astype(cdouble) + ev = linalg.eigvals(ad) + assert_almost_equal(d, multiply.reduce(ev, axis=-1)) + assert_almost_equal(s * np.exp(ld), multiply.reduce(ev, axis=-1)) + + s = np.atleast_1d(s) + ld = np.atleast_1d(ld) + m = (s != 0) + assert_almost_equal(np.abs(s[m]), 1) + assert_equal(ld[~m], -inf) + + +class TestDet(DetCases): + def test_zero(self): + assert_equal(linalg.det([[0.0]]), 0.0) + assert_equal(type(linalg.det([[0.0]])), double) + assert_equal(linalg.det([[0.0j]]), 0.0) + assert_equal(type(linalg.det([[0.0j]])), cdouble) + + assert_equal(linalg.slogdet([[0.0]]), (0.0, -inf)) + assert_equal(type(linalg.slogdet([[0.0]])[0]), double) + assert_equal(type(linalg.slogdet([[0.0]])[1]), double) + assert_equal(linalg.slogdet([[0.0j]]), (0.0j, -inf)) + assert_equal(type(linalg.slogdet([[0.0j]])[0]), cdouble) + assert_equal(type(linalg.slogdet([[0.0j]])[1]), double) + + @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble]) + def test_types(self, dtype): + x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) + assert_equal(np.linalg.det(x).dtype, dtype) + ph, s = np.linalg.slogdet(x) + assert_equal(s.dtype, get_real_dtype(dtype)) + assert_equal(ph.dtype, dtype) + + def test_0_size(self): + a = np.zeros((0, 0), dtype=np.complex64) + res = linalg.det(a) + assert_equal(res, 1.) + assert_(res.dtype.type is np.complex64) + res = linalg.slogdet(a) + assert_equal(res, (1, 0)) + assert_(res[0].dtype.type is np.complex64) + assert_(res[1].dtype.type is np.float32) + + a = np.zeros((0, 0), dtype=np.float64) + res = linalg.det(a) + assert_equal(res, 1.) + assert_(res.dtype.type is np.float64) + res = linalg.slogdet(a) + assert_equal(res, (1, 0)) + assert_(res[0].dtype.type is np.float64) + assert_(res[1].dtype.type is np.float64) + + +class LstsqCases(LinalgSquareTestCase, LinalgNonsquareTestCase): + + def do(self, a, b, tags): + arr = np.asarray(a) + m, n = arr.shape + u, s, vt = linalg.svd(a, False) + x, residuals, rank, sv = linalg.lstsq(a, b, rcond=-1) + if m == 0: + assert_((x == 0).all()) + if m <= n: + assert_almost_equal(b, dot(a, x)) + assert_equal(rank, m) + else: + assert_equal(rank, n) + assert_almost_equal(sv, sv.__array_wrap__(s)) + if rank == n and m > n: + expect_resids = ( + np.asarray(abs(np.dot(a, x) - b)) ** 2).sum(axis=0) + expect_resids = np.asarray(expect_resids) + if np.asarray(b).ndim == 1: + expect_resids.shape = (1,) + assert_equal(residuals.shape, expect_resids.shape) + else: + expect_resids = np.array([]).view(type(x)) + assert_almost_equal(residuals, expect_resids) + assert_(np.issubdtype(residuals.dtype, np.floating)) + assert_(consistent_subclass(x, b)) + assert_(consistent_subclass(residuals, b)) + + +class TestLstsq(LstsqCases): + def test_future_rcond(self): + a = np.array([[0., 1., 0., 1., 2., 0.], + [0., 2., 0., 0., 1., 0.], + [1., 0., 1., 0., 0., 4.], + [0., 0., 0., 2., 3., 0.]]).T + + b = np.array([1, 0, 0, 0, 0, 0]) + with suppress_warnings() as sup: + w = sup.record(FutureWarning, "`rcond` parameter will change") + x, residuals, rank, s = linalg.lstsq(a, b) + assert_(rank == 4) + x, residuals, rank, s = linalg.lstsq(a, b, rcond=-1) + assert_(rank == 4) + x, residuals, rank, s = linalg.lstsq(a, b, rcond=None) + assert_(rank == 3) + # Warning should be raised exactly once (first command) + assert_(len(w) == 1) + + @pytest.mark.parametrize(["m", "n", "n_rhs"], [ + (4, 2, 2), + (0, 4, 1), + (0, 4, 2), + (4, 0, 1), + (4, 0, 2), + (4, 2, 0), + (0, 0, 0) + ]) + def test_empty_a_b(self, m, n, n_rhs): + a = np.arange(m * n).reshape(m, n) + b = np.ones((m, n_rhs)) + x, residuals, rank, s = linalg.lstsq(a, b, rcond=None) + if m == 0: + assert_((x == 0).all()) + assert_equal(x.shape, (n, n_rhs)) + assert_equal(residuals.shape, ((n_rhs,) if m > n else (0,))) + if m > n and n_rhs > 0: + # residuals are exactly the squared norms of b's columns + r = b - np.dot(a, x) + assert_almost_equal(residuals, (r * r).sum(axis=-2)) + assert_equal(rank, min(m, n)) + assert_equal(s.shape, (min(m, n),)) + + def test_incompatible_dims(self): + # use modified version of docstring example + x = np.array([0, 1, 2, 3]) + y = np.array([-1, 0.2, 0.9, 2.1, 3.3]) + A = np.vstack([x, np.ones(len(x))]).T + with assert_raises_regex(LinAlgError, "Incompatible dimensions"): + linalg.lstsq(A, y, rcond=None) + + +@pytest.mark.parametrize('dt', [np.dtype(c) for c in '?bBhHiIqQefdgFDGO']) +class TestMatrixPower: + + rshft_0 = np.eye(4) + rshft_1 = rshft_0[[3, 0, 1, 2]] + rshft_2 = rshft_0[[2, 3, 0, 1]] + rshft_3 = rshft_0[[1, 2, 3, 0]] + rshft_all = [rshft_0, rshft_1, rshft_2, rshft_3] + noninv = array([[1, 0], [0, 0]]) + stacked = np.block([[[rshft_0]]]*2) + #FIXME the 'e' dtype might work in future + dtnoinv = [object, np.dtype('e'), np.dtype('g'), np.dtype('G')] + + def test_large_power(self, dt): + rshft = self.rshft_1.astype(dt) + assert_equal( + matrix_power(rshft, 2**100 + 2**10 + 2**5 + 0), self.rshft_0) + assert_equal( + matrix_power(rshft, 2**100 + 2**10 + 2**5 + 1), self.rshft_1) + assert_equal( + matrix_power(rshft, 2**100 + 2**10 + 2**5 + 2), self.rshft_2) + assert_equal( + matrix_power(rshft, 2**100 + 2**10 + 2**5 + 3), self.rshft_3) + + def test_power_is_zero(self, dt): + def tz(M): + mz = matrix_power(M, 0) + assert_equal(mz, identity_like_generalized(M)) + assert_equal(mz.dtype, M.dtype) + + for mat in self.rshft_all: + tz(mat.astype(dt)) + if dt != object: + tz(self.stacked.astype(dt)) + + def test_power_is_one(self, dt): + def tz(mat): + mz = matrix_power(mat, 1) + assert_equal(mz, mat) + assert_equal(mz.dtype, mat.dtype) + + for mat in self.rshft_all: + tz(mat.astype(dt)) + if dt != object: + tz(self.stacked.astype(dt)) + + def test_power_is_two(self, dt): + def tz(mat): + mz = matrix_power(mat, 2) + mmul = matmul if mat.dtype != object else dot + assert_equal(mz, mmul(mat, mat)) + assert_equal(mz.dtype, mat.dtype) + + for mat in self.rshft_all: + tz(mat.astype(dt)) + if dt != object: + tz(self.stacked.astype(dt)) + + def test_power_is_minus_one(self, dt): + def tz(mat): + invmat = matrix_power(mat, -1) + mmul = matmul if mat.dtype != object else dot + assert_almost_equal( + mmul(invmat, mat), identity_like_generalized(mat)) + + for mat in self.rshft_all: + if dt not in self.dtnoinv: + tz(mat.astype(dt)) + + def test_exceptions_bad_power(self, dt): + mat = self.rshft_0.astype(dt) + assert_raises(TypeError, matrix_power, mat, 1.5) + assert_raises(TypeError, matrix_power, mat, [1]) + + def test_exceptions_non_square(self, dt): + assert_raises(LinAlgError, matrix_power, np.array([1], dt), 1) + assert_raises(LinAlgError, matrix_power, np.array([[1], [2]], dt), 1) + assert_raises(LinAlgError, matrix_power, np.ones((4, 3, 2), dt), 1) + + @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm") + def test_exceptions_not_invertible(self, dt): + if dt in self.dtnoinv: + return + mat = self.noninv.astype(dt) + assert_raises(LinAlgError, matrix_power, mat, -1) + + +class TestEigvalshCases(HermitianTestCase, HermitianGeneralizedTestCase): + + def do(self, a, b, tags): + # note that eigenvalue arrays returned by eig must be sorted since + # their order isn't guaranteed. + ev = linalg.eigvalsh(a, 'L') + evalues, evectors = linalg.eig(a) + evalues.sort(axis=-1) + assert_allclose(ev, evalues, rtol=get_rtol(ev.dtype)) + + ev2 = linalg.eigvalsh(a, 'U') + assert_allclose(ev2, evalues, rtol=get_rtol(ev.dtype)) + + +class TestEigvalsh: + @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble]) + def test_types(self, dtype): + x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) + w = np.linalg.eigvalsh(x) + assert_equal(w.dtype, get_real_dtype(dtype)) + + def test_invalid(self): + x = np.array([[1, 0.5], [0.5, 1]], dtype=np.float32) + assert_raises(ValueError, np.linalg.eigvalsh, x, UPLO="lrong") + assert_raises(ValueError, np.linalg.eigvalsh, x, "lower") + assert_raises(ValueError, np.linalg.eigvalsh, x, "upper") + + def test_UPLO(self): + Klo = np.array([[0, 0], [1, 0]], dtype=np.double) + Kup = np.array([[0, 1], [0, 0]], dtype=np.double) + tgt = np.array([-1, 1], dtype=np.double) + rtol = get_rtol(np.double) + + # Check default is 'L' + w = np.linalg.eigvalsh(Klo) + assert_allclose(w, tgt, rtol=rtol) + # Check 'L' + w = np.linalg.eigvalsh(Klo, UPLO='L') + assert_allclose(w, tgt, rtol=rtol) + # Check 'l' + w = np.linalg.eigvalsh(Klo, UPLO='l') + assert_allclose(w, tgt, rtol=rtol) + # Check 'U' + w = np.linalg.eigvalsh(Kup, UPLO='U') + assert_allclose(w, tgt, rtol=rtol) + # Check 'u' + w = np.linalg.eigvalsh(Kup, UPLO='u') + assert_allclose(w, tgt, rtol=rtol) + + def test_0_size(self): + # Check that all kinds of 0-sized arrays work + class ArraySubclass(np.ndarray): + pass + a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass) + res = linalg.eigvalsh(a) + assert_(res.dtype.type is np.float64) + assert_equal((0, 1), res.shape) + # This is just for documentation, it might make sense to change: + assert_(isinstance(res, np.ndarray)) + + a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass) + res = linalg.eigvalsh(a) + assert_(res.dtype.type is np.float32) + assert_equal((0,), res.shape) + # This is just for documentation, it might make sense to change: + assert_(isinstance(res, np.ndarray)) + + +class TestEighCases(HermitianTestCase, HermitianGeneralizedTestCase): + + def do(self, a, b, tags): + # note that eigenvalue arrays returned by eig must be sorted since + # their order isn't guaranteed. + res = linalg.eigh(a) + ev, evc = res.eigenvalues, res.eigenvectors + evalues, evectors = linalg.eig(a) + evalues.sort(axis=-1) + assert_almost_equal(ev, evalues) + + assert_allclose(dot_generalized(a, evc), + np.asarray(ev)[..., None, :] * np.asarray(evc), + rtol=get_rtol(ev.dtype)) + + ev2, evc2 = linalg.eigh(a, 'U') + assert_almost_equal(ev2, evalues) + + assert_allclose(dot_generalized(a, evc2), + np.asarray(ev2)[..., None, :] * np.asarray(evc2), + rtol=get_rtol(ev.dtype), err_msg=repr(a)) + + +class TestEigh: + @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble]) + def test_types(self, dtype): + x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype) + w, v = np.linalg.eigh(x) + assert_equal(w.dtype, get_real_dtype(dtype)) + assert_equal(v.dtype, dtype) + + def test_invalid(self): + x = np.array([[1, 0.5], [0.5, 1]], dtype=np.float32) + assert_raises(ValueError, np.linalg.eigh, x, UPLO="lrong") + assert_raises(ValueError, np.linalg.eigh, x, "lower") + assert_raises(ValueError, np.linalg.eigh, x, "upper") + + def test_UPLO(self): + Klo = np.array([[0, 0], [1, 0]], dtype=np.double) + Kup = np.array([[0, 1], [0, 0]], dtype=np.double) + tgt = np.array([-1, 1], dtype=np.double) + rtol = get_rtol(np.double) + + # Check default is 'L' + w, v = np.linalg.eigh(Klo) + assert_allclose(w, tgt, rtol=rtol) + # Check 'L' + w, v = np.linalg.eigh(Klo, UPLO='L') + assert_allclose(w, tgt, rtol=rtol) + # Check 'l' + w, v = np.linalg.eigh(Klo, UPLO='l') + assert_allclose(w, tgt, rtol=rtol) + # Check 'U' + w, v = np.linalg.eigh(Kup, UPLO='U') + assert_allclose(w, tgt, rtol=rtol) + # Check 'u' + w, v = np.linalg.eigh(Kup, UPLO='u') + assert_allclose(w, tgt, rtol=rtol) + + def test_0_size(self): + # Check that all kinds of 0-sized arrays work + class ArraySubclass(np.ndarray): + pass + a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass) + res, res_v = linalg.eigh(a) + assert_(res_v.dtype.type is np.float64) + assert_(res.dtype.type is np.float64) + assert_equal(a.shape, res_v.shape) + assert_equal((0, 1), res.shape) + # This is just for documentation, it might make sense to change: + assert_(isinstance(a, np.ndarray)) + + a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass) + res, res_v = linalg.eigh(a) + assert_(res_v.dtype.type is np.complex64) + assert_(res.dtype.type is np.float32) + assert_equal(a.shape, res_v.shape) + assert_equal((0,), res.shape) + # This is just for documentation, it might make sense to change: + assert_(isinstance(a, np.ndarray)) + + +class _TestNormBase: + dt = None + dec = None + + @staticmethod + def check_dtype(x, res): + if issubclass(x.dtype.type, np.inexact): + assert_equal(res.dtype, x.real.dtype) + else: + # For integer input, don't have to test float precision of output. + assert_(issubclass(res.dtype.type, np.floating)) + + +class _TestNormGeneral(_TestNormBase): + + def test_empty(self): + assert_equal(norm([]), 0.0) + assert_equal(norm(array([], dtype=self.dt)), 0.0) + assert_equal(norm(atleast_2d(array([], dtype=self.dt))), 0.0) + + def test_vector_return_type(self): + a = np.array([1, 0, 1]) + + exact_types = np.typecodes['AllInteger'] + inexact_types = np.typecodes['AllFloat'] + + all_types = exact_types + inexact_types + + for each_type in all_types: + at = a.astype(each_type) + + an = norm(at, -np.inf) + self.check_dtype(at, an) + assert_almost_equal(an, 0.0) + + with suppress_warnings() as sup: + sup.filter(RuntimeWarning, "divide by zero encountered") + an = norm(at, -1) + self.check_dtype(at, an) + assert_almost_equal(an, 0.0) + + an = norm(at, 0) + self.check_dtype(at, an) + assert_almost_equal(an, 2) + + an = norm(at, 1) + self.check_dtype(at, an) + assert_almost_equal(an, 2.0) + + an = norm(at, 2) + self.check_dtype(at, an) + assert_almost_equal(an, an.dtype.type(2.0)**an.dtype.type(1.0/2.0)) + + an = norm(at, 4) + self.check_dtype(at, an) + assert_almost_equal(an, an.dtype.type(2.0)**an.dtype.type(1.0/4.0)) + + an = norm(at, np.inf) + self.check_dtype(at, an) + assert_almost_equal(an, 1.0) + + def test_vector(self): + a = [1, 2, 3, 4] + b = [-1, -2, -3, -4] + c = [-1, 2, -3, 4] + + def _test(v): + np.testing.assert_almost_equal(norm(v), 30 ** 0.5, + decimal=self.dec) + np.testing.assert_almost_equal(norm(v, inf), 4.0, + decimal=self.dec) + np.testing.assert_almost_equal(norm(v, -inf), 1.0, + decimal=self.dec) + np.testing.assert_almost_equal(norm(v, 1), 10.0, + decimal=self.dec) + np.testing.assert_almost_equal(norm(v, -1), 12.0 / 25, + decimal=self.dec) + np.testing.assert_almost_equal(norm(v, 2), 30 ** 0.5, + decimal=self.dec) + np.testing.assert_almost_equal(norm(v, -2), ((205. / 144) ** -0.5), + decimal=self.dec) + np.testing.assert_almost_equal(norm(v, 0), 4, + decimal=self.dec) + + for v in (a, b, c,): + _test(v) + + for v in (array(a, dtype=self.dt), array(b, dtype=self.dt), + array(c, dtype=self.dt)): + _test(v) + + def test_axis(self): + # Vector norms. + # Compare the use of `axis` with computing the norm of each row + # or column separately. + A = array([[1, 2, 3], [4, 5, 6]], dtype=self.dt) + for order in [None, -1, 0, 1, 2, 3, np.Inf, -np.Inf]: + expected0 = [norm(A[:, k], ord=order) for k in range(A.shape[1])] + assert_almost_equal(norm(A, ord=order, axis=0), expected0) + expected1 = [norm(A[k, :], ord=order) for k in range(A.shape[0])] + assert_almost_equal(norm(A, ord=order, axis=1), expected1) + + # Matrix norms. + B = np.arange(1, 25, dtype=self.dt).reshape(2, 3, 4) + nd = B.ndim + for order in [None, -2, 2, -1, 1, np.Inf, -np.Inf, 'fro']: + for axis in itertools.combinations(range(-nd, nd), 2): + row_axis, col_axis = axis + if row_axis < 0: + row_axis += nd + if col_axis < 0: + col_axis += nd + if row_axis == col_axis: + assert_raises(ValueError, norm, B, ord=order, axis=axis) + else: + n = norm(B, ord=order, axis=axis) + + # The logic using k_index only works for nd = 3. + # This has to be changed if nd is increased. + k_index = nd - (row_axis + col_axis) + if row_axis < col_axis: + expected = [norm(B[:].take(k, axis=k_index), ord=order) + for k in range(B.shape[k_index])] + else: + expected = [norm(B[:].take(k, axis=k_index).T, ord=order) + for k in range(B.shape[k_index])] + assert_almost_equal(n, expected) + + def test_keepdims(self): + A = np.arange(1, 25, dtype=self.dt).reshape(2, 3, 4) + + allclose_err = 'order {0}, axis = {1}' + shape_err = 'Shape mismatch found {0}, expected {1}, order={2}, axis={3}' + + # check the order=None, axis=None case + expected = norm(A, ord=None, axis=None) + found = norm(A, ord=None, axis=None, keepdims=True) + assert_allclose(np.squeeze(found), expected, + err_msg=allclose_err.format(None, None)) + expected_shape = (1, 1, 1) + assert_(found.shape == expected_shape, + shape_err.format(found.shape, expected_shape, None, None)) + + # Vector norms. + for order in [None, -1, 0, 1, 2, 3, np.Inf, -np.Inf]: + for k in range(A.ndim): + expected = norm(A, ord=order, axis=k) + found = norm(A, ord=order, axis=k, keepdims=True) + assert_allclose(np.squeeze(found), expected, + err_msg=allclose_err.format(order, k)) + expected_shape = list(A.shape) + expected_shape[k] = 1 + expected_shape = tuple(expected_shape) + assert_(found.shape == expected_shape, + shape_err.format(found.shape, expected_shape, order, k)) + + # Matrix norms. + for order in [None, -2, 2, -1, 1, np.Inf, -np.Inf, 'fro', 'nuc']: + for k in itertools.permutations(range(A.ndim), 2): + expected = norm(A, ord=order, axis=k) + found = norm(A, ord=order, axis=k, keepdims=True) + assert_allclose(np.squeeze(found), expected, + err_msg=allclose_err.format(order, k)) + expected_shape = list(A.shape) + expected_shape[k[0]] = 1 + expected_shape[k[1]] = 1 + expected_shape = tuple(expected_shape) + assert_(found.shape == expected_shape, + shape_err.format(found.shape, expected_shape, order, k)) + + +class _TestNorm2D(_TestNormBase): + # Define the part for 2d arrays separately, so we can subclass this + # and run the tests using np.matrix in matrixlib.tests.test_matrix_linalg. + array = np.array + + def test_matrix_empty(self): + assert_equal(norm(self.array([[]], dtype=self.dt)), 0.0) + + def test_matrix_return_type(self): + a = self.array([[1, 0, 1], [0, 1, 1]]) + + exact_types = np.typecodes['AllInteger'] + + # float32, complex64, float64, complex128 types are the only types + # allowed by `linalg`, which performs the matrix operations used + # within `norm`. + inexact_types = 'fdFD' + + all_types = exact_types + inexact_types + + for each_type in all_types: + at = a.astype(each_type) + + an = norm(at, -np.inf) + self.check_dtype(at, an) + assert_almost_equal(an, 2.0) + + with suppress_warnings() as sup: + sup.filter(RuntimeWarning, "divide by zero encountered") + an = norm(at, -1) + self.check_dtype(at, an) + assert_almost_equal(an, 1.0) + + an = norm(at, 1) + self.check_dtype(at, an) + assert_almost_equal(an, 2.0) + + an = norm(at, 2) + self.check_dtype(at, an) + assert_almost_equal(an, 3.0**(1.0/2.0)) + + an = norm(at, -2) + self.check_dtype(at, an) + assert_almost_equal(an, 1.0) + + an = norm(at, np.inf) + self.check_dtype(at, an) + assert_almost_equal(an, 2.0) + + an = norm(at, 'fro') + self.check_dtype(at, an) + assert_almost_equal(an, 2.0) + + an = norm(at, 'nuc') + self.check_dtype(at, an) + # Lower bar needed to support low precision floats. + # They end up being off by 1 in the 7th place. + np.testing.assert_almost_equal(an, 2.7320508075688772, decimal=6) + + def test_matrix_2x2(self): + A = self.array([[1, 3], [5, 7]], dtype=self.dt) + assert_almost_equal(norm(A), 84 ** 0.5) + assert_almost_equal(norm(A, 'fro'), 84 ** 0.5) + assert_almost_equal(norm(A, 'nuc'), 10.0) + assert_almost_equal(norm(A, inf), 12.0) + assert_almost_equal(norm(A, -inf), 4.0) + assert_almost_equal(norm(A, 1), 10.0) + assert_almost_equal(norm(A, -1), 6.0) + assert_almost_equal(norm(A, 2), 9.1231056256176615) + assert_almost_equal(norm(A, -2), 0.87689437438234041) + + assert_raises(ValueError, norm, A, 'nofro') + assert_raises(ValueError, norm, A, -3) + assert_raises(ValueError, norm, A, 0) + + def test_matrix_3x3(self): + # This test has been added because the 2x2 example + # happened to have equal nuclear norm and induced 1-norm. + # The 1/10 scaling factor accommodates the absolute tolerance + # used in assert_almost_equal. + A = (1 / 10) * \ + self.array([[1, 2, 3], [6, 0, 5], [3, 2, 1]], dtype=self.dt) + assert_almost_equal(norm(A), (1 / 10) * 89 ** 0.5) + assert_almost_equal(norm(A, 'fro'), (1 / 10) * 89 ** 0.5) + assert_almost_equal(norm(A, 'nuc'), 1.3366836911774836) + assert_almost_equal(norm(A, inf), 1.1) + assert_almost_equal(norm(A, -inf), 0.6) + assert_almost_equal(norm(A, 1), 1.0) + assert_almost_equal(norm(A, -1), 0.4) + assert_almost_equal(norm(A, 2), 0.88722940323461277) + assert_almost_equal(norm(A, -2), 0.19456584790481812) + + def test_bad_args(self): + # Check that bad arguments raise the appropriate exceptions. + + A = self.array([[1, 2, 3], [4, 5, 6]], dtype=self.dt) + B = np.arange(1, 25, dtype=self.dt).reshape(2, 3, 4) + + # Using `axis=` or passing in a 1-D array implies vector + # norms are being computed, so also using `ord='fro'` + # or `ord='nuc'` or any other string raises a ValueError. + assert_raises(ValueError, norm, A, 'fro', 0) + assert_raises(ValueError, norm, A, 'nuc', 0) + assert_raises(ValueError, norm, [3, 4], 'fro', None) + assert_raises(ValueError, norm, [3, 4], 'nuc', None) + assert_raises(ValueError, norm, [3, 4], 'test', None) + + # Similarly, norm should raise an exception when ord is any finite + # number other than 1, 2, -1 or -2 when computing matrix norms. + for order in [0, 3]: + assert_raises(ValueError, norm, A, order, None) + assert_raises(ValueError, norm, A, order, (0, 1)) + assert_raises(ValueError, norm, B, order, (1, 2)) + + # Invalid axis + assert_raises(np.AxisError, norm, B, None, 3) + assert_raises(np.AxisError, norm, B, None, (2, 3)) + assert_raises(ValueError, norm, B, None, (0, 1, 2)) + + +class _TestNorm(_TestNorm2D, _TestNormGeneral): + pass + + +class TestNorm_NonSystematic: + + def test_longdouble_norm(self): + # Non-regression test: p-norm of longdouble would previously raise + # UnboundLocalError. + x = np.arange(10, dtype=np.longdouble) + old_assert_almost_equal(norm(x, ord=3), 12.65, decimal=2) + + def test_intmin(self): + # Non-regression test: p-norm of signed integer would previously do + # float cast and abs in the wrong order. + x = np.array([-2 ** 31], dtype=np.int32) + old_assert_almost_equal(norm(x, ord=3), 2 ** 31, decimal=5) + + def test_complex_high_ord(self): + # gh-4156 + d = np.empty((2,), dtype=np.clongdouble) + d[0] = 6 + 7j + d[1] = -6 + 7j + res = 11.615898132184 + old_assert_almost_equal(np.linalg.norm(d, ord=3), res, decimal=10) + d = d.astype(np.complex128) + old_assert_almost_equal(np.linalg.norm(d, ord=3), res, decimal=9) + d = d.astype(np.complex64) + old_assert_almost_equal(np.linalg.norm(d, ord=3), res, decimal=5) + + +# Separate definitions so we can use them for matrix tests. +class _TestNormDoubleBase(_TestNormBase): + dt = np.double + dec = 12 + + +class _TestNormSingleBase(_TestNormBase): + dt = np.float32 + dec = 6 + + +class _TestNormInt64Base(_TestNormBase): + dt = np.int64 + dec = 12 + + +class TestNormDouble(_TestNorm, _TestNormDoubleBase): + pass + + +class TestNormSingle(_TestNorm, _TestNormSingleBase): + pass + + +class TestNormInt64(_TestNorm, _TestNormInt64Base): + pass + + +class TestMatrixRank: + + def test_matrix_rank(self): + # Full rank matrix + assert_equal(4, matrix_rank(np.eye(4))) + # rank deficient matrix + I = np.eye(4) + I[-1, -1] = 0. + assert_equal(matrix_rank(I), 3) + # All zeros - zero rank + assert_equal(matrix_rank(np.zeros((4, 4))), 0) + # 1 dimension - rank 1 unless all 0 + assert_equal(matrix_rank([1, 0, 0, 0]), 1) + assert_equal(matrix_rank(np.zeros((4,))), 0) + # accepts array-like + assert_equal(matrix_rank([1]), 1) + # greater than 2 dimensions treated as stacked matrices + ms = np.array([I, np.eye(4), np.zeros((4,4))]) + assert_equal(matrix_rank(ms), np.array([3, 4, 0])) + # works on scalar + assert_equal(matrix_rank(1), 1) + + def test_symmetric_rank(self): + assert_equal(4, matrix_rank(np.eye(4), hermitian=True)) + assert_equal(1, matrix_rank(np.ones((4, 4)), hermitian=True)) + assert_equal(0, matrix_rank(np.zeros((4, 4)), hermitian=True)) + # rank deficient matrix + I = np.eye(4) + I[-1, -1] = 0. + assert_equal(3, matrix_rank(I, hermitian=True)) + # manually supplied tolerance + I[-1, -1] = 1e-8 + assert_equal(4, matrix_rank(I, hermitian=True, tol=0.99e-8)) + assert_equal(3, matrix_rank(I, hermitian=True, tol=1.01e-8)) + + +def test_reduced_rank(): + # Test matrices with reduced rank + rng = np.random.RandomState(20120714) + for i in range(100): + # Make a rank deficient matrix + X = rng.normal(size=(40, 10)) + X[:, 0] = X[:, 1] + X[:, 2] + # Assert that matrix_rank detected deficiency + assert_equal(matrix_rank(X), 9) + X[:, 3] = X[:, 4] + X[:, 5] + assert_equal(matrix_rank(X), 8) + + +class TestQR: + # Define the array class here, so run this on matrices elsewhere. + array = np.array + + def check_qr(self, a): + # This test expects the argument `a` to be an ndarray or + # a subclass of an ndarray of inexact type. + a_type = type(a) + a_dtype = a.dtype + m, n = a.shape + k = min(m, n) + + # mode == 'complete' + res = linalg.qr(a, mode='complete') + Q, R = res.Q, res.R + assert_(Q.dtype == a_dtype) + assert_(R.dtype == a_dtype) + assert_(isinstance(Q, a_type)) + assert_(isinstance(R, a_type)) + assert_(Q.shape == (m, m)) + assert_(R.shape == (m, n)) + assert_almost_equal(dot(Q, R), a) + assert_almost_equal(dot(Q.T.conj(), Q), np.eye(m)) + assert_almost_equal(np.triu(R), R) + + # mode == 'reduced' + q1, r1 = linalg.qr(a, mode='reduced') + assert_(q1.dtype == a_dtype) + assert_(r1.dtype == a_dtype) + assert_(isinstance(q1, a_type)) + assert_(isinstance(r1, a_type)) + assert_(q1.shape == (m, k)) + assert_(r1.shape == (k, n)) + assert_almost_equal(dot(q1, r1), a) + assert_almost_equal(dot(q1.T.conj(), q1), np.eye(k)) + assert_almost_equal(np.triu(r1), r1) + + # mode == 'r' + r2 = linalg.qr(a, mode='r') + assert_(r2.dtype == a_dtype) + assert_(isinstance(r2, a_type)) + assert_almost_equal(r2, r1) + + + @pytest.mark.parametrize(["m", "n"], [ + (3, 0), + (0, 3), + (0, 0) + ]) + def test_qr_empty(self, m, n): + k = min(m, n) + a = np.empty((m, n)) + + self.check_qr(a) + + h, tau = np.linalg.qr(a, mode='raw') + assert_equal(h.dtype, np.double) + assert_equal(tau.dtype, np.double) + assert_equal(h.shape, (n, m)) + assert_equal(tau.shape, (k,)) + + def test_mode_raw(self): + # The factorization is not unique and varies between libraries, + # so it is not possible to check against known values. Functional + # testing is a possibility, but awaits the exposure of more + # of the functions in lapack_lite. Consequently, this test is + # very limited in scope. Note that the results are in FORTRAN + # order, hence the h arrays are transposed. + a = self.array([[1, 2], [3, 4], [5, 6]], dtype=np.double) + + # Test double + h, tau = linalg.qr(a, mode='raw') + assert_(h.dtype == np.double) + assert_(tau.dtype == np.double) + assert_(h.shape == (2, 3)) + assert_(tau.shape == (2,)) + + h, tau = linalg.qr(a.T, mode='raw') + assert_(h.dtype == np.double) + assert_(tau.dtype == np.double) + assert_(h.shape == (3, 2)) + assert_(tau.shape == (2,)) + + def test_mode_all_but_economic(self): + a = self.array([[1, 2], [3, 4]]) + b = self.array([[1, 2], [3, 4], [5, 6]]) + for dt in "fd": + m1 = a.astype(dt) + m2 = b.astype(dt) + self.check_qr(m1) + self.check_qr(m2) + self.check_qr(m2.T) + + for dt in "fd": + m1 = 1 + 1j * a.astype(dt) + m2 = 1 + 1j * b.astype(dt) + self.check_qr(m1) + self.check_qr(m2) + self.check_qr(m2.T) + + def check_qr_stacked(self, a): + # This test expects the argument `a` to be an ndarray or + # a subclass of an ndarray of inexact type. + a_type = type(a) + a_dtype = a.dtype + m, n = a.shape[-2:] + k = min(m, n) + + # mode == 'complete' + q, r = linalg.qr(a, mode='complete') + assert_(q.dtype == a_dtype) + assert_(r.dtype == a_dtype) + assert_(isinstance(q, a_type)) + assert_(isinstance(r, a_type)) + assert_(q.shape[-2:] == (m, m)) + assert_(r.shape[-2:] == (m, n)) + assert_almost_equal(matmul(q, r), a) + I_mat = np.identity(q.shape[-1]) + stack_I_mat = np.broadcast_to(I_mat, + q.shape[:-2] + (q.shape[-1],)*2) + assert_almost_equal(matmul(swapaxes(q, -1, -2).conj(), q), stack_I_mat) + assert_almost_equal(np.triu(r[..., :, :]), r) + + # mode == 'reduced' + q1, r1 = linalg.qr(a, mode='reduced') + assert_(q1.dtype == a_dtype) + assert_(r1.dtype == a_dtype) + assert_(isinstance(q1, a_type)) + assert_(isinstance(r1, a_type)) + assert_(q1.shape[-2:] == (m, k)) + assert_(r1.shape[-2:] == (k, n)) + assert_almost_equal(matmul(q1, r1), a) + I_mat = np.identity(q1.shape[-1]) + stack_I_mat = np.broadcast_to(I_mat, + q1.shape[:-2] + (q1.shape[-1],)*2) + assert_almost_equal(matmul(swapaxes(q1, -1, -2).conj(), q1), + stack_I_mat) + assert_almost_equal(np.triu(r1[..., :, :]), r1) + + # mode == 'r' + r2 = linalg.qr(a, mode='r') + assert_(r2.dtype == a_dtype) + assert_(isinstance(r2, a_type)) + assert_almost_equal(r2, r1) + + @pytest.mark.parametrize("size", [ + (3, 4), (4, 3), (4, 4), + (3, 0), (0, 3)]) + @pytest.mark.parametrize("outer_size", [ + (2, 2), (2,), (2, 3, 4)]) + @pytest.mark.parametrize("dt", [ + np.single, np.double, + np.csingle, np.cdouble]) + def test_stacked_inputs(self, outer_size, size, dt): + + A = np.random.normal(size=outer_size + size).astype(dt) + B = np.random.normal(size=outer_size + size).astype(dt) + self.check_qr_stacked(A) + self.check_qr_stacked(A + 1.j*B) + + +class TestCholesky: + # TODO: are there no other tests for cholesky? + + @pytest.mark.parametrize( + 'shape', [(1, 1), (2, 2), (3, 3), (50, 50), (3, 10, 10)] + ) + @pytest.mark.parametrize( + 'dtype', (np.float32, np.float64, np.complex64, np.complex128) + ) + def test_basic_property(self, shape, dtype): + # Check A = L L^H + np.random.seed(1) + a = np.random.randn(*shape) + if np.issubdtype(dtype, np.complexfloating): + a = a + 1j*np.random.randn(*shape) + + t = list(range(len(shape))) + t[-2:] = -1, -2 + + a = np.matmul(a.transpose(t).conj(), a) + a = np.asarray(a, dtype=dtype) + + c = np.linalg.cholesky(a) + + b = np.matmul(c, c.transpose(t).conj()) + with np._no_nep50_warning(): + atol = 500 * a.shape[0] * np.finfo(dtype).eps + assert_allclose(b, a, atol=atol, err_msg=f'{shape} {dtype}\n{a}\n{c}') + + def test_0_size(self): + class ArraySubclass(np.ndarray): + pass + a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass) + res = linalg.cholesky(a) + assert_equal(a.shape, res.shape) + assert_(res.dtype.type is np.float64) + # for documentation purpose: + assert_(isinstance(res, np.ndarray)) + + a = np.zeros((1, 0, 0), dtype=np.complex64).view(ArraySubclass) + res = linalg.cholesky(a) + assert_equal(a.shape, res.shape) + assert_(res.dtype.type is np.complex64) + assert_(isinstance(res, np.ndarray)) + + +def test_byteorder_check(): + # Byte order check should pass for native order + if sys.byteorder == 'little': + native = '<' + else: + native = '>' + + for dtt in (np.float32, np.float64): + arr = np.eye(4, dtype=dtt) + n_arr = arr.newbyteorder(native) + sw_arr = arr.newbyteorder('S').byteswap() + assert_equal(arr.dtype.byteorder, '=') + for routine in (linalg.inv, linalg.det, linalg.pinv): + # Normal call + res = routine(arr) + # Native but not '=' + assert_array_equal(res, routine(n_arr)) + # Swapped + assert_array_equal(res, routine(sw_arr)) + + +@pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm") +def test_generalized_raise_multiloop(): + # It should raise an error even if the error doesn't occur in the + # last iteration of the ufunc inner loop + + invertible = np.array([[1, 2], [3, 4]]) + non_invertible = np.array([[1, 1], [1, 1]]) + + x = np.zeros([4, 4, 2, 2])[1::2] + x[...] = invertible + x[0, 0] = non_invertible + + assert_raises(np.linalg.LinAlgError, np.linalg.inv, x) + + +def test_xerbla_override(): + # Check that our xerbla has been successfully linked in. If it is not, + # the default xerbla routine is called, which prints a message to stdout + # and may, or may not, abort the process depending on the LAPACK package. + + XERBLA_OK = 255 + + try: + pid = os.fork() + except (OSError, AttributeError): + # fork failed, or not running on POSIX + pytest.skip("Not POSIX or fork failed.") + + if pid == 0: + # child; close i/o file handles + os.close(1) + os.close(0) + # Avoid producing core files. + import resource + resource.setrlimit(resource.RLIMIT_CORE, (0, 0)) + # These calls may abort. + try: + np.linalg.lapack_lite.xerbla() + except ValueError: + pass + except Exception: + os._exit(os.EX_CONFIG) + + try: + a = np.array([[1.]]) + np.linalg.lapack_lite.dorgqr( + 1, 1, 1, a, + 0, # <- invalid value + a, a, 0, 0) + except ValueError as e: + if "DORGQR parameter number 5" in str(e): + # success, reuse error code to mark success as + # FORTRAN STOP returns as success. + os._exit(XERBLA_OK) + + # Did not abort, but our xerbla was not linked in. + os._exit(os.EX_CONFIG) + else: + # parent + pid, status = os.wait() + if os.WEXITSTATUS(status) != XERBLA_OK: + pytest.skip('Numpy xerbla not linked in.') + + +@pytest.mark.skipif(IS_WASM, reason="Cannot start subprocess") +@pytest.mark.slow +def test_sdot_bug_8577(): + # Regression test that loading certain other libraries does not + # result to wrong results in float32 linear algebra. + # + # There's a bug gh-8577 on OSX that can trigger this, and perhaps + # there are also other situations in which it occurs. + # + # Do the check in a separate process. + + bad_libs = ['PyQt5.QtWidgets', 'IPython'] + + template = textwrap.dedent(""" + import sys + {before} + try: + import {bad_lib} + except ImportError: + sys.exit(0) + {after} + x = np.ones(2, dtype=np.float32) + sys.exit(0 if np.allclose(x.dot(x), 2.0) else 1) + """) + + for bad_lib in bad_libs: + code = template.format(before="import numpy as np", after="", + bad_lib=bad_lib) + subprocess.check_call([sys.executable, "-c", code]) + + # Swapped import order + code = template.format(after="import numpy as np", before="", + bad_lib=bad_lib) + subprocess.check_call([sys.executable, "-c", code]) + + +class TestMultiDot: + + def test_basic_function_with_three_arguments(self): + # multi_dot with three arguments uses a fast hand coded algorithm to + # determine the optimal order. Therefore test it separately. + A = np.random.random((6, 2)) + B = np.random.random((2, 6)) + C = np.random.random((6, 2)) + + assert_almost_equal(multi_dot([A, B, C]), A.dot(B).dot(C)) + assert_almost_equal(multi_dot([A, B, C]), np.dot(A, np.dot(B, C))) + + def test_basic_function_with_two_arguments(self): + # separate code path with two arguments + A = np.random.random((6, 2)) + B = np.random.random((2, 6)) + + assert_almost_equal(multi_dot([A, B]), A.dot(B)) + assert_almost_equal(multi_dot([A, B]), np.dot(A, B)) + + def test_basic_function_with_dynamic_programming_optimization(self): + # multi_dot with four or more arguments uses the dynamic programming + # optimization and therefore deserve a separate + A = np.random.random((6, 2)) + B = np.random.random((2, 6)) + C = np.random.random((6, 2)) + D = np.random.random((2, 1)) + assert_almost_equal(multi_dot([A, B, C, D]), A.dot(B).dot(C).dot(D)) + + def test_vector_as_first_argument(self): + # The first argument can be 1-D + A1d = np.random.random(2) # 1-D + B = np.random.random((2, 6)) + C = np.random.random((6, 2)) + D = np.random.random((2, 2)) + + # the result should be 1-D + assert_equal(multi_dot([A1d, B, C, D]).shape, (2,)) + + def test_vector_as_last_argument(self): + # The last argument can be 1-D + A = np.random.random((6, 2)) + B = np.random.random((2, 6)) + C = np.random.random((6, 2)) + D1d = np.random.random(2) # 1-D + + # the result should be 1-D + assert_equal(multi_dot([A, B, C, D1d]).shape, (6,)) + + def test_vector_as_first_and_last_argument(self): + # The first and last arguments can be 1-D + A1d = np.random.random(2) # 1-D + B = np.random.random((2, 6)) + C = np.random.random((6, 2)) + D1d = np.random.random(2) # 1-D + + # the result should be a scalar + assert_equal(multi_dot([A1d, B, C, D1d]).shape, ()) + + def test_three_arguments_and_out(self): + # multi_dot with three arguments uses a fast hand coded algorithm to + # determine the optimal order. Therefore test it separately. + A = np.random.random((6, 2)) + B = np.random.random((2, 6)) + C = np.random.random((6, 2)) + + out = np.zeros((6, 2)) + ret = multi_dot([A, B, C], out=out) + assert out is ret + assert_almost_equal(out, A.dot(B).dot(C)) + assert_almost_equal(out, np.dot(A, np.dot(B, C))) + + def test_two_arguments_and_out(self): + # separate code path with two arguments + A = np.random.random((6, 2)) + B = np.random.random((2, 6)) + out = np.zeros((6, 6)) + ret = multi_dot([A, B], out=out) + assert out is ret + assert_almost_equal(out, A.dot(B)) + assert_almost_equal(out, np.dot(A, B)) + + def test_dynamic_programming_optimization_and_out(self): + # multi_dot with four or more arguments uses the dynamic programming + # optimization and therefore deserve a separate test + A = np.random.random((6, 2)) + B = np.random.random((2, 6)) + C = np.random.random((6, 2)) + D = np.random.random((2, 1)) + out = np.zeros((6, 1)) + ret = multi_dot([A, B, C, D], out=out) + assert out is ret + assert_almost_equal(out, A.dot(B).dot(C).dot(D)) + + def test_dynamic_programming_logic(self): + # Test for the dynamic programming part + # This test is directly taken from Cormen page 376. + arrays = [np.random.random((30, 35)), + np.random.random((35, 15)), + np.random.random((15, 5)), + np.random.random((5, 10)), + np.random.random((10, 20)), + np.random.random((20, 25))] + m_expected = np.array([[0., 15750., 7875., 9375., 11875., 15125.], + [0., 0., 2625., 4375., 7125., 10500.], + [0., 0., 0., 750., 2500., 5375.], + [0., 0., 0., 0., 1000., 3500.], + [0., 0., 0., 0., 0., 5000.], + [0., 0., 0., 0., 0., 0.]]) + s_expected = np.array([[0, 1, 1, 3, 3, 3], + [0, 0, 2, 3, 3, 3], + [0, 0, 0, 3, 3, 3], + [0, 0, 0, 0, 4, 5], + [0, 0, 0, 0, 0, 5], + [0, 0, 0, 0, 0, 0]], dtype=int) + s_expected -= 1 # Cormen uses 1-based index, python does not. + + s, m = _multi_dot_matrix_chain_order(arrays, return_costs=True) + + # Only the upper triangular part (without the diagonal) is interesting. + assert_almost_equal(np.triu(s[:-1, 1:]), + np.triu(s_expected[:-1, 1:])) + assert_almost_equal(np.triu(m), np.triu(m_expected)) + + def test_too_few_input_arrays(self): + assert_raises(ValueError, multi_dot, []) + assert_raises(ValueError, multi_dot, [np.random.random((3, 3))]) + + +class TestTensorinv: + + @pytest.mark.parametrize("arr, ind", [ + (np.ones((4, 6, 8, 2)), 2), + (np.ones((3, 3, 2)), 1), + ]) + def test_non_square_handling(self, arr, ind): + with assert_raises(LinAlgError): + linalg.tensorinv(arr, ind=ind) + + @pytest.mark.parametrize("shape, ind", [ + # examples from docstring + ((4, 6, 8, 3), 2), + ((24, 8, 3), 1), + ]) + def test_tensorinv_shape(self, shape, ind): + a = np.eye(24) + a.shape = shape + ainv = linalg.tensorinv(a=a, ind=ind) + expected = a.shape[ind:] + a.shape[:ind] + actual = ainv.shape + assert_equal(actual, expected) + + @pytest.mark.parametrize("ind", [ + 0, -2, + ]) + def test_tensorinv_ind_limit(self, ind): + a = np.eye(24) + a.shape = (4, 6, 8, 3) + with assert_raises(ValueError): + linalg.tensorinv(a=a, ind=ind) + + def test_tensorinv_result(self): + # mimic a docstring example + a = np.eye(24) + a.shape = (24, 8, 3) + ainv = linalg.tensorinv(a, ind=1) + b = np.ones(24) + assert_allclose(np.tensordot(ainv, b, 1), np.linalg.tensorsolve(a, b)) + + +class TestTensorsolve: + + @pytest.mark.parametrize("a, axes", [ + (np.ones((4, 6, 8, 2)), None), + (np.ones((3, 3, 2)), (0, 2)), + ]) + def test_non_square_handling(self, a, axes): + with assert_raises(LinAlgError): + b = np.ones(a.shape[:2]) + linalg.tensorsolve(a, b, axes=axes) + + @pytest.mark.parametrize("shape", + [(2, 3, 6), (3, 4, 4, 3), (0, 3, 3, 0)], + ) + def test_tensorsolve_result(self, shape): + a = np.random.randn(*shape) + b = np.ones(a.shape[:2]) + x = np.linalg.tensorsolve(a, b) + assert_allclose(np.tensordot(a, x, axes=len(x.shape)), b) + + +def test_unsupported_commontype(): + # linalg gracefully handles unsupported type + arr = np.array([[1, -2], [2, 5]], dtype='float16') + with assert_raises_regex(TypeError, "unsupported in linalg"): + linalg.cholesky(arr) + + +#@pytest.mark.slow +#@pytest.mark.xfail(not HAS_LAPACK64, run=False, +# reason="Numpy not compiled with 64-bit BLAS/LAPACK") +#@requires_memory(free_bytes=16e9) +@pytest.mark.skip(reason="Bad memory reports lead to OOM in ci testing") +def test_blas64_dot(): + n = 2**32 + a = np.zeros([1, n], dtype=np.float32) + b = np.ones([1, 1], dtype=np.float32) + a[0,-1] = 1 + c = np.dot(b, a) + assert_equal(c[0,-1], 1) + + +@pytest.mark.xfail(not HAS_LAPACK64, + reason="Numpy not compiled with 64-bit BLAS/LAPACK") +def test_blas64_geqrf_lwork_smoketest(): + # Smoke test LAPACK geqrf lwork call with 64-bit integers + dtype = np.float64 + lapack_routine = np.linalg.lapack_lite.dgeqrf + + m = 2**32 + 1 + n = 2**32 + 1 + lda = m + + # Dummy arrays, not referenced by the lapack routine, so don't + # need to be of the right size + a = np.zeros([1, 1], dtype=dtype) + work = np.zeros([1], dtype=dtype) + tau = np.zeros([1], dtype=dtype) + + # Size query + results = lapack_routine(m, n, a, lda, tau, work, -1, 0) + assert_equal(results['info'], 0) + assert_equal(results['m'], m) + assert_equal(results['n'], m) + + # Should result to an integer of a reasonable size + lwork = int(work.item()) + assert_(2**32 < lwork < 2**42) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_regression.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..af38443a93c3b74d4344130ab7bbac0206e9f7f7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_regression.py @@ -0,0 +1,145 @@ +""" Test functions for linalg module +""" +import warnings + +import numpy as np +from numpy import linalg, arange, float64, array, dot, transpose +from numpy.testing import ( + assert_, assert_raises, assert_equal, assert_array_equal, + assert_array_almost_equal, assert_array_less +) + + +class TestRegression: + + def test_eig_build(self): + # Ticket #652 + rva = array([1.03221168e+02 + 0.j, + -1.91843603e+01 + 0.j, + -6.04004526e-01 + 15.84422474j, + -6.04004526e-01 - 15.84422474j, + -1.13692929e+01 + 0.j, + -6.57612485e-01 + 10.41755503j, + -6.57612485e-01 - 10.41755503j, + 1.82126812e+01 + 0.j, + 1.06011014e+01 + 0.j, + 7.80732773e+00 + 0.j, + -7.65390898e-01 + 0.j, + 1.51971555e-15 + 0.j, + -1.51308713e-15 + 0.j]) + a = arange(13 * 13, dtype=float64) + a.shape = (13, 13) + a = a % 17 + va, ve = linalg.eig(a) + va.sort() + rva.sort() + assert_array_almost_equal(va, rva) + + def test_eigh_build(self): + # Ticket 662. + rvals = [68.60568999, 89.57756725, 106.67185574] + + cov = array([[77.70273908, 3.51489954, 15.64602427], + [3.51489954, 88.97013878, -1.07431931], + [15.64602427, -1.07431931, 98.18223512]]) + + vals, vecs = linalg.eigh(cov) + assert_array_almost_equal(vals, rvals) + + def test_svd_build(self): + # Ticket 627. + a = array([[0., 1.], [1., 1.], [2., 1.], [3., 1.]]) + m, n = a.shape + u, s, vh = linalg.svd(a) + + b = dot(transpose(u[:, n:]), a) + + assert_array_almost_equal(b, np.zeros((2, 2))) + + def test_norm_vector_badarg(self): + # Regression for #786: Frobenius norm for vectors raises + # ValueError. + assert_raises(ValueError, linalg.norm, array([1., 2., 3.]), 'fro') + + def test_lapack_endian(self): + # For bug #1482 + a = array([[5.7998084, -2.1825367], + [-2.1825367, 9.85910595]], dtype='>f8') + b = array(a, dtype=' 0.5) + assert_equal(c, 1) + assert_equal(np.linalg.matrix_rank(a), 1) + assert_array_less(1, np.linalg.norm(a, ord=2)) + + def test_norm_object_array(self): + # gh-7575 + testvector = np.array([np.array([0, 1]), 0, 0], dtype=object) + + norm = linalg.norm(testvector) + assert_array_equal(norm, [0, 1]) + assert_(norm.dtype == np.dtype('float64')) + + norm = linalg.norm(testvector, ord=1) + assert_array_equal(norm, [0, 1]) + assert_(norm.dtype != np.dtype('float64')) + + norm = linalg.norm(testvector, ord=2) + assert_array_equal(norm, [0, 1]) + assert_(norm.dtype == np.dtype('float64')) + + assert_raises(ValueError, linalg.norm, testvector, ord='fro') + assert_raises(ValueError, linalg.norm, testvector, ord='nuc') + assert_raises(ValueError, linalg.norm, testvector, ord=np.inf) + assert_raises(ValueError, linalg.norm, testvector, ord=-np.inf) + assert_raises(ValueError, linalg.norm, testvector, ord=0) + assert_raises(ValueError, linalg.norm, testvector, ord=-1) + assert_raises(ValueError, linalg.norm, testvector, ord=-2) + + testmatrix = np.array([[np.array([0, 1]), 0, 0], + [0, 0, 0]], dtype=object) + + norm = linalg.norm(testmatrix) + assert_array_equal(norm, [0, 1]) + assert_(norm.dtype == np.dtype('float64')) + + norm = linalg.norm(testmatrix, ord='fro') + assert_array_equal(norm, [0, 1]) + assert_(norm.dtype == np.dtype('float64')) + + assert_raises(TypeError, linalg.norm, testmatrix, ord='nuc') + assert_raises(ValueError, linalg.norm, testmatrix, ord=np.inf) + assert_raises(ValueError, linalg.norm, testmatrix, ord=-np.inf) + assert_raises(ValueError, linalg.norm, testmatrix, ord=0) + assert_raises(ValueError, linalg.norm, testmatrix, ord=1) + assert_raises(ValueError, linalg.norm, testmatrix, ord=-1) + assert_raises(TypeError, linalg.norm, testmatrix, ord=2) + assert_raises(TypeError, linalg.norm, testmatrix, ord=-2) + assert_raises(ValueError, linalg.norm, testmatrix, ord=3) + + def test_lstsq_complex_larger_rhs(self): + # gh-9891 + size = 20 + n_rhs = 70 + G = np.random.randn(size, size) + 1j * np.random.randn(size, size) + u = np.random.randn(size, n_rhs) + 1j * np.random.randn(size, n_rhs) + b = G.dot(u) + # This should work without segmentation fault. + u_lstsq, res, rank, sv = linalg.lstsq(G, b, rcond=None) + # check results just in case + assert_array_almost_equal(u_lstsq, u) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/arithmetic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/arithmetic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e256bc37c6c335a54261d34a6f802c2c7321328b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/arithmetic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/arrayterator.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/arrayterator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b9646a8df2465254e5a0cb602830d95a544a550 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/arrayterator.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/bitwise_ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/bitwise_ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9318c67f6e4973518826e566c142acc53056a33a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/bitwise_ops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/einsumfunc.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/einsumfunc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..134247a1f999cdb040e5d2a5996ff711f72cf842 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/einsumfunc.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/lib_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/lib_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e44399097320c8ff6cd068c2f34a8099b74e7b30 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/lib_utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/literal.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/literal.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d988303bf7a1b08cd039f7ccce63429f80acd5c2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/literal.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/multiarray.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/multiarray.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..827590b1214199cc47a15733fde0a520d6d85022 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/multiarray.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/numeric.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/numeric.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f25197625ab3be8be460dab9a9e29074fd912c7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/numeric.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/simple_py3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/simple_py3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c1f8bd185a6d1c28a26fb9023529a9539ed3c0d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/simple_py3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/ufuncs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/ufuncs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..335c9515f838a20868624ce3c3b18db9689fc581 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/ufuncs.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/warnings_and_errors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/warnings_and_errors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61d20ae9394b00e221d86cf2bed6143d153ffb1d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/warnings_and_errors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution_double_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution_double_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..454c0b2d12778ad7b7337d189653cb44b9327895 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution_double_backward.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +inline ::std::tuple _convolution_double_backward(const ::std::optional & ggI, const ::std::optional & ggW, const ::std::optional & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array output_mask) { + return at::_ops::_convolution_double_backward::call(ggI, ggW, ggb, gO, weight, self, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask); +} +namespace symint { + template >> + ::std::tuple _convolution_double_backward(const ::std::optional & ggI, const ::std::optional & ggW, const ::std::optional & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array output_mask) { + return at::_ops::_convolution_double_backward::call(ggI, ggW, ggb, gO, weight, self, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask); + } +} + +// aten::_convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +inline ::std::tuple _convolution_double_backward_symint(const ::std::optional & ggI, const ::std::optional & ggW, const ::std::optional & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array output_mask) { + return at::_ops::_convolution_double_backward::call(ggI, ggW, ggb, gO, weight, self, stride, padding, dilation, transposed, output_padding, groups, output_mask); +} +namespace symint { + template >> + ::std::tuple _convolution_double_backward(const ::std::optional & ggI, const ::std::optional & ggW, const ::std::optional & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array output_mask) { + return at::_ops::_convolution_double_backward::call(ggI, ggW, ggb, gO, weight, self, stride, padding, dilation, transposed, output_padding, groups, output_mask); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_relu_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_relu_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3a54c0f6c3734615a2acf372d0fc0f62d22feaad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_relu_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API cudnn_convolution_relu { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::cudnn_convolution_relu"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); +}; + +struct TORCH_API cudnn_convolution_relu_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::cudnn_convolution_relu"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "cudnn_convolution_relu.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4b8399254b2f4ac51ae809f800672a90268a89a7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API mkldnn_adaptive_avg_pool2d { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mkldnn_adaptive_avg_pool2d"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::IntArrayRef output_size); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size); +}; + +struct TORCH_API mkldnn_adaptive_avg_pool2d_out { + using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mkldnn_adaptive_avg_pool2d"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "mkldnn_adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..7b0eb00bdf69acc7dc935f62a4590474eca07cbd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_silu : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f948c61e0902061f7ed6cec000648b106a50cf45 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor slow_conv_transpose2d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1); +TORCH_API at::Tensor slow_conv_transpose2d_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..26131cf94bb7acb6b525a6cb3fd3a6d548411e3a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor upsample_bilinear2d(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor upsample_bilinear2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/any.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/any.h new file mode 100644 index 0000000000000000000000000000000000000000..1bb6306e75cda733f20e34123e0f6720aefb8a06 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/any.h @@ -0,0 +1,154 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_ANY_H__ +#define GOOGLE_PROTOBUF_ANY_H__ + +#include + +#include +#include +#include + +#include + +namespace google { +namespace protobuf { + +class FieldDescriptor; +class Message; + +namespace internal { + +extern const char kAnyFullTypeName[]; // "google.protobuf.Any". +extern const char kTypeGoogleApisComPrefix[]; // "type.googleapis.com/". +extern const char kTypeGoogleProdComPrefix[]; // "type.googleprod.com/". + +std::string GetTypeUrl(StringPiece message_name, + StringPiece type_url_prefix); + +// Helper class used to implement google::protobuf::Any. +class PROTOBUF_EXPORT AnyMetadata { + typedef ArenaStringPtr UrlType; + typedef ArenaStringPtr ValueType; + public: + // AnyMetadata does not take ownership of "type_url" and "value". + AnyMetadata(UrlType* type_url, ValueType* value); + + // Packs a message using the default type URL prefix: "type.googleapis.com". + // The resulted type URL will be "type.googleapis.com/". + template + void PackFrom(const T& message) { + InternalPackFrom(message, kTypeGoogleApisComPrefix, T::FullMessageName()); + } + + void PackFrom(const Message& message); + + // Packs a message using the given type URL prefix. The type URL will be + // constructed by concatenating the message type's full name to the prefix + // with an optional "/" separator if the prefix doesn't already end with "/". + // For example, both PackFrom(message, "type.googleapis.com") and + // PackFrom(message, "type.googleapis.com/") yield the same result type + // URL: "type.googleapis.com/". + template + void PackFrom(const T& message, StringPiece type_url_prefix) { + InternalPackFrom(message, type_url_prefix, T::FullMessageName()); + } + + void PackFrom(const Message& message, const std::string& type_url_prefix); + + // Unpacks the payload into the given message. Returns false if the message's + // type doesn't match the type specified in the type URL (i.e., the full + // name after the last "/" of the type URL doesn't match the message's actual + // full name) or parsing the payload has failed. + template + bool UnpackTo(T* message) const { + return InternalUnpackTo(T::FullMessageName(), message); + } + + bool UnpackTo(Message* message) const; + + // Checks whether the type specified in the type URL matches the given type. + // A type is considered matching if its full name matches the full name after + // the last "/" in the type URL. + template + bool Is() const { + return InternalIs(T::FullMessageName()); + } + + private: + void InternalPackFrom(const MessageLite& message, + StringPiece type_url_prefix, + StringPiece type_name); + bool InternalUnpackTo(StringPiece type_name, + MessageLite* message) const; + bool InternalIs(StringPiece type_name) const; + + UrlType* type_url_; + ValueType* value_; + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(AnyMetadata); +}; + +// Get the proto type name from Any::type_url value. For example, passing +// "type.googleapis.com/rpc.QueryOrigin" will return "rpc.QueryOrigin" in +// *full_type_name. Returns false if the type_url does not have a "/" +// in the type url separating the full type name. +// +// NOTE: this function is available publicly as: +// google::protobuf::Any() // static method on the generated message type. +bool ParseAnyTypeUrl(const std::string& type_url, std::string* full_type_name); + +// Get the proto type name and prefix from Any::type_url value. For example, +// passing "type.googleapis.com/rpc.QueryOrigin" will return +// "type.googleapis.com/" in *url_prefix and "rpc.QueryOrigin" in +// *full_type_name. Returns false if the type_url does not have a "/" in the +// type url separating the full type name. +bool ParseAnyTypeUrl(const std::string& type_url, std::string* url_prefix, + std::string* full_type_name); + +// See if message is of type google.protobuf.Any, if so, return the descriptors +// for "type_url" and "value" fields. +bool GetAnyFieldDescriptors(const Message& message, + const FieldDescriptor** type_url_field, + const FieldDescriptor** value_field); + +} // namespace internal +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_ANY_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/extension_set_inl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/extension_set_inl.h new file mode 100644 index 0000000000000000000000000000000000000000..c8996c1c4e5022c0fe2a0e7fedbdd7241a8716b6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/extension_set_inl.h @@ -0,0 +1,281 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_EXTENSION_SET_INL_H__ +#define GOOGLE_PROTOBUF_EXTENSION_SET_INL_H__ + +#include +#include +#include + +namespace google { +namespace protobuf { +namespace internal { + +template +const char* ExtensionSet::ParseFieldWithExtensionInfo( + int number, bool was_packed_on_wire, const ExtensionInfo& extension, + InternalMetadata* metadata, const char* ptr, internal::ParseContext* ctx) { + if (was_packed_on_wire) { + switch (extension.type) { +#define HANDLE_TYPE(UPPERCASE, CPP_CAMELCASE) \ + case WireFormatLite::TYPE_##UPPERCASE: \ + return internal::Packed##CPP_CAMELCASE##Parser( \ + MutableRawRepeatedField(number, extension.type, extension.is_packed, \ + extension.descriptor), \ + ptr, ctx); + HANDLE_TYPE(INT32, Int32); + HANDLE_TYPE(INT64, Int64); + HANDLE_TYPE(UINT32, UInt32); + HANDLE_TYPE(UINT64, UInt64); + HANDLE_TYPE(SINT32, SInt32); + HANDLE_TYPE(SINT64, SInt64); + HANDLE_TYPE(FIXED32, Fixed32); + HANDLE_TYPE(FIXED64, Fixed64); + HANDLE_TYPE(SFIXED32, SFixed32); + HANDLE_TYPE(SFIXED64, SFixed64); + HANDLE_TYPE(FLOAT, Float); + HANDLE_TYPE(DOUBLE, Double); + HANDLE_TYPE(BOOL, Bool); +#undef HANDLE_TYPE + + case WireFormatLite::TYPE_ENUM: + return internal::PackedEnumParserArg( + MutableRawRepeatedField(number, extension.type, extension.is_packed, + extension.descriptor), + ptr, ctx, extension.enum_validity_check.func, + extension.enum_validity_check.arg, metadata, number); + case WireFormatLite::TYPE_STRING: + case WireFormatLite::TYPE_BYTES: + case WireFormatLite::TYPE_GROUP: + case WireFormatLite::TYPE_MESSAGE: + GOOGLE_LOG(FATAL) << "Non-primitive types can't be packed."; + break; + } + } else { + switch (extension.type) { +#define HANDLE_VARINT_TYPE(UPPERCASE, CPP_CAMELCASE) \ + case WireFormatLite::TYPE_##UPPERCASE: { \ + uint64 value; \ + ptr = VarintParse(ptr, &value); \ + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr); \ + if (extension.is_repeated) { \ + Add##CPP_CAMELCASE(number, WireFormatLite::TYPE_##UPPERCASE, \ + extension.is_packed, value, extension.descriptor); \ + } else { \ + Set##CPP_CAMELCASE(number, WireFormatLite::TYPE_##UPPERCASE, value, \ + extension.descriptor); \ + } \ + } break + + HANDLE_VARINT_TYPE(INT32, Int32); + HANDLE_VARINT_TYPE(INT64, Int64); + HANDLE_VARINT_TYPE(UINT32, UInt32); + HANDLE_VARINT_TYPE(UINT64, UInt64); + HANDLE_VARINT_TYPE(BOOL, Bool); +#undef HANDLE_VARINT_TYPE +#define HANDLE_SVARINT_TYPE(UPPERCASE, CPP_CAMELCASE, SIZE) \ + case WireFormatLite::TYPE_##UPPERCASE: { \ + uint64 val; \ + ptr = VarintParse(ptr, &val); \ + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr); \ + auto value = WireFormatLite::ZigZagDecode##SIZE(val); \ + if (extension.is_repeated) { \ + Add##CPP_CAMELCASE(number, WireFormatLite::TYPE_##UPPERCASE, \ + extension.is_packed, value, extension.descriptor); \ + } else { \ + Set##CPP_CAMELCASE(number, WireFormatLite::TYPE_##UPPERCASE, value, \ + extension.descriptor); \ + } \ + } break + + HANDLE_SVARINT_TYPE(SINT32, Int32, 32); + HANDLE_SVARINT_TYPE(SINT64, Int64, 64); +#undef HANDLE_SVARINT_TYPE +#define HANDLE_FIXED_TYPE(UPPERCASE, CPP_CAMELCASE, CPPTYPE) \ + case WireFormatLite::TYPE_##UPPERCASE: { \ + auto value = UnalignedLoad(ptr); \ + ptr += sizeof(CPPTYPE); \ + if (extension.is_repeated) { \ + Add##CPP_CAMELCASE(number, WireFormatLite::TYPE_##UPPERCASE, \ + extension.is_packed, value, extension.descriptor); \ + } else { \ + Set##CPP_CAMELCASE(number, WireFormatLite::TYPE_##UPPERCASE, value, \ + extension.descriptor); \ + } \ + } break + + HANDLE_FIXED_TYPE(FIXED32, UInt32, uint32); + HANDLE_FIXED_TYPE(FIXED64, UInt64, uint64); + HANDLE_FIXED_TYPE(SFIXED32, Int32, int32); + HANDLE_FIXED_TYPE(SFIXED64, Int64, int64); + HANDLE_FIXED_TYPE(FLOAT, Float, float); + HANDLE_FIXED_TYPE(DOUBLE, Double, double); +#undef HANDLE_FIXED_TYPE + + case WireFormatLite::TYPE_ENUM: { + uint64 val; + ptr = VarintParse(ptr, &val); + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr); + int value = val; + + if (!extension.enum_validity_check.func( + extension.enum_validity_check.arg, value)) { + WriteVarint(number, val, metadata->mutable_unknown_fields()); + } else if (extension.is_repeated) { + AddEnum(number, WireFormatLite::TYPE_ENUM, extension.is_packed, value, + extension.descriptor); + } else { + SetEnum(number, WireFormatLite::TYPE_ENUM, value, + extension.descriptor); + } + break; + } + + case WireFormatLite::TYPE_BYTES: + case WireFormatLite::TYPE_STRING: { + std::string* value = + extension.is_repeated + ? AddString(number, WireFormatLite::TYPE_STRING, + extension.descriptor) + : MutableString(number, WireFormatLite::TYPE_STRING, + extension.descriptor); + int size = ReadSize(&ptr); + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr); + return ctx->ReadString(ptr, size, value); + } + + case WireFormatLite::TYPE_GROUP: { + MessageLite* value = + extension.is_repeated + ? AddMessage(number, WireFormatLite::TYPE_GROUP, + *extension.message_info.prototype, + extension.descriptor) + : MutableMessage(number, WireFormatLite::TYPE_GROUP, + *extension.message_info.prototype, + extension.descriptor); + uint32 tag = (number << 3) + WireFormatLite::WIRETYPE_START_GROUP; + return ctx->ParseGroup(value, ptr, tag); + } + + case WireFormatLite::TYPE_MESSAGE: { + MessageLite* value = + extension.is_repeated + ? AddMessage(number, WireFormatLite::TYPE_MESSAGE, + *extension.message_info.prototype, + extension.descriptor) + : MutableMessage(number, WireFormatLite::TYPE_MESSAGE, + *extension.message_info.prototype, + extension.descriptor); + return ctx->ParseMessage(value, ptr); + } + } + } + return ptr; +} + +template +const char* ExtensionSet::ParseMessageSetItemTmpl( + const char* ptr, const Msg* containing_type, + internal::InternalMetadata* metadata, internal::ParseContext* ctx) { + std::string payload; + uint32 type_id = 0; + bool payload_read = false; + while (!ctx->Done(&ptr)) { + uint32 tag = static_cast(*ptr++); + if (tag == WireFormatLite::kMessageSetTypeIdTag) { + uint64 tmp; + ptr = ParseBigVarint(ptr, &tmp); + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr); + type_id = tmp; + if (payload_read) { + ExtensionInfo extension; + bool was_packed_on_wire; + if (!FindExtension(2, type_id, containing_type, ctx, &extension, + &was_packed_on_wire)) { + WriteLengthDelimited(type_id, payload, + metadata->mutable_unknown_fields()); + } else { + MessageLite* value = + extension.is_repeated + ? AddMessage(type_id, WireFormatLite::TYPE_MESSAGE, + *extension.message_info.prototype, + extension.descriptor) + : MutableMessage(type_id, WireFormatLite::TYPE_MESSAGE, + *extension.message_info.prototype, + extension.descriptor); + + const char* p; + // We can't use regular parse from string as we have to track + // proper recursion depth and descriptor pools. + ParseContext tmp_ctx(ctx->depth(), false, &p, payload); + tmp_ctx.data().pool = ctx->data().pool; + tmp_ctx.data().factory = ctx->data().factory; + GOOGLE_PROTOBUF_PARSER_ASSERT(value->_InternalParse(p, &tmp_ctx) && + tmp_ctx.EndedAtLimit()); + } + type_id = 0; + } + } else if (tag == WireFormatLite::kMessageSetMessageTag) { + if (type_id != 0) { + ptr = ParseFieldMaybeLazily(static_cast(type_id) * 8 + 2, ptr, + containing_type, metadata, ctx); + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr != nullptr); + type_id = 0; + } else { + int32 size = ReadSize(&ptr); + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr); + ptr = ctx->ReadString(ptr, size, &payload); + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr); + payload_read = true; + } + } else { + ptr = ReadTag(ptr - 1, &tag); + if (tag == 0 || (tag & 7) == 4) { + ctx->SetLastTag(tag); + return ptr; + } + ptr = ParseField(tag, ptr, containing_type, metadata, ctx); + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr); + } + } + return ptr; +} + +} // namespace internal +} // namespace protobuf +} // namespace google + +#endif // GOOGLE_PROTOBUF_EXTENSION_SET_INL_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/field_mask.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/field_mask.pb.h new file mode 100644 index 0000000000000000000000000000000000000000..d8d57d823c293171522dfef2099cf1be0b51b261 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/field_mask.pb.h @@ -0,0 +1,320 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: google/protobuf/field_mask.proto + +#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2ffield_5fmask_2eproto +#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2ffield_5fmask_2eproto + +#include +#include + +#include +#if PROTOBUF_VERSION < 3013000 +#error This file was generated by a newer version of protoc which is +#error incompatible with your Protocol Buffer headers. Please update +#error your headers. +#endif +#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION +#error This file was generated by an older version of protoc which is +#error incompatible with your Protocol Buffer headers. Please +#error regenerate this file with a newer version of protoc. +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: export +#include // IWYU pragma: export +#include +// @@protoc_insertion_point(includes) +#include +#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2ffield_5fmask_2eproto PROTOBUF_EXPORT +PROTOBUF_NAMESPACE_OPEN +namespace internal { +class AnyMetadata; +} // namespace internal +PROTOBUF_NAMESPACE_CLOSE + +// Internal implementation detail -- do not use these members. +struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2ffield_5fmask_2eproto { + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[]; + static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[]; + static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[]; +}; +extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2ffield_5fmask_2eproto; +PROTOBUF_NAMESPACE_OPEN +class FieldMask; +class FieldMaskDefaultTypeInternal; +PROTOBUF_EXPORT extern FieldMaskDefaultTypeInternal _FieldMask_default_instance_; +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::FieldMask* Arena::CreateMaybeMessage(Arena*); +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN + +// =================================================================== + +class PROTOBUF_EXPORT FieldMask PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.FieldMask) */ { + public: + inline FieldMask() : FieldMask(nullptr) {} + virtual ~FieldMask(); + + FieldMask(const FieldMask& from); + FieldMask(FieldMask&& from) noexcept + : FieldMask() { + *this = ::std::move(from); + } + + inline FieldMask& operator=(const FieldMask& from) { + CopyFrom(from); + return *this; + } + inline FieldMask& operator=(FieldMask&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const FieldMask& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const FieldMask* internal_default_instance() { + return reinterpret_cast( + &_FieldMask_default_instance_); + } + static constexpr int kIndexInFileMessages = + 0; + + friend void swap(FieldMask& a, FieldMask& b) { + a.Swap(&b); + } + inline void Swap(FieldMask* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(FieldMask* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline FieldMask* New() const final { + return CreateMaybeMessage(nullptr); + } + + FieldMask* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const FieldMask& from); + void MergeFrom(const FieldMask& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(FieldMask* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.FieldMask"; + } + protected: + explicit FieldMask(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2ffield_5fmask_2eproto); + return ::descriptor_table_google_2fprotobuf_2ffield_5fmask_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kPathsFieldNumber = 1, + }; + // repeated string paths = 1; + int paths_size() const; + private: + int _internal_paths_size() const; + public: + void clear_paths(); + const std::string& paths(int index) const; + std::string* mutable_paths(int index); + void set_paths(int index, const std::string& value); + void set_paths(int index, std::string&& value); + void set_paths(int index, const char* value); + void set_paths(int index, const char* value, size_t size); + std::string* add_paths(); + void add_paths(const std::string& value); + void add_paths(std::string&& value); + void add_paths(const char* value); + void add_paths(const char* value, size_t size); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField& paths() const; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField* mutable_paths(); + private: + const std::string& _internal_paths(int index) const; + std::string* _internal_add_paths(); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.FieldMask) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField paths_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2ffield_5fmask_2eproto; +}; +// =================================================================== + + +// =================================================================== + +#ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // __GNUC__ +// FieldMask + +// repeated string paths = 1; +inline int FieldMask::_internal_paths_size() const { + return paths_.size(); +} +inline int FieldMask::paths_size() const { + return _internal_paths_size(); +} +inline void FieldMask::clear_paths() { + paths_.Clear(); +} +inline std::string* FieldMask::add_paths() { + // @@protoc_insertion_point(field_add_mutable:google.protobuf.FieldMask.paths) + return _internal_add_paths(); +} +inline const std::string& FieldMask::_internal_paths(int index) const { + return paths_.Get(index); +} +inline const std::string& FieldMask::paths(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.FieldMask.paths) + return _internal_paths(index); +} +inline std::string* FieldMask::mutable_paths(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.FieldMask.paths) + return paths_.Mutable(index); +} +inline void FieldMask::set_paths(int index, const std::string& value) { + // @@protoc_insertion_point(field_set:google.protobuf.FieldMask.paths) + paths_.Mutable(index)->assign(value); +} +inline void FieldMask::set_paths(int index, std::string&& value) { + // @@protoc_insertion_point(field_set:google.protobuf.FieldMask.paths) + paths_.Mutable(index)->assign(std::move(value)); +} +inline void FieldMask::set_paths(int index, const char* value) { + GOOGLE_DCHECK(value != nullptr); + paths_.Mutable(index)->assign(value); + // @@protoc_insertion_point(field_set_char:google.protobuf.FieldMask.paths) +} +inline void FieldMask::set_paths(int index, const char* value, size_t size) { + paths_.Mutable(index)->assign( + reinterpret_cast(value), size); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.FieldMask.paths) +} +inline std::string* FieldMask::_internal_add_paths() { + return paths_.Add(); +} +inline void FieldMask::add_paths(const std::string& value) { + paths_.Add()->assign(value); + // @@protoc_insertion_point(field_add:google.protobuf.FieldMask.paths) +} +inline void FieldMask::add_paths(std::string&& value) { + paths_.Add(std::move(value)); + // @@protoc_insertion_point(field_add:google.protobuf.FieldMask.paths) +} +inline void FieldMask::add_paths(const char* value) { + GOOGLE_DCHECK(value != nullptr); + paths_.Add()->assign(value); + // @@protoc_insertion_point(field_add_char:google.protobuf.FieldMask.paths) +} +inline void FieldMask::add_paths(const char* value, size_t size) { + paths_.Add()->assign(reinterpret_cast(value), size); + // @@protoc_insertion_point(field_add_pointer:google.protobuf.FieldMask.paths) +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField& +FieldMask::paths() const { + // @@protoc_insertion_point(field_list:google.protobuf.FieldMask.paths) + return paths_; +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField* +FieldMask::mutable_paths() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.FieldMask.paths) + return &paths_; +} + +#ifdef __GNUC__ + #pragma GCC diagnostic pop +#endif // __GNUC__ + +// @@protoc_insertion_point(namespace_scope) + +PROTOBUF_NAMESPACE_CLOSE + +// @@protoc_insertion_point(global_scope) + +#include +#endif // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2ffield_5fmask_2eproto + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_message_table_driven.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_message_table_driven.h new file mode 100644 index 0000000000000000000000000000000000000000..7165fd31b5927217ed2628ca6044a2ffc41e5274 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_message_table_driven.h @@ -0,0 +1,344 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_GENERATED_MESSAGE_TABLE_DRIVEN_H__ +#define GOOGLE_PROTOBUF_GENERATED_MESSAGE_TABLE_DRIVEN_H__ + +#include +#include +#include +#include +#include + +// We require C++11 and Clang to use constexpr for variables, as GCC 4.8 +// requires constexpr to be consistent between declarations of variables +// unnecessarily (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58541). +// VS 2017 Update 3 also supports this usage of constexpr. +#if defined(__clang__) || (defined(_MSC_VER) && _MSC_VER >= 1911) +#define PROTOBUF_CONSTEXPR_VAR constexpr +#else // !__clang__ +#define PROTOBUF_CONSTEXPR_VAR +#endif // !_clang + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +#include + +namespace google { +namespace protobuf { +namespace internal { + +// Processing-type masks. +static constexpr const unsigned char kOneofMask = 0x40; +static constexpr const unsigned char kRepeatedMask = 0x20; +// Mask for the raw type: either a WireFormatLite::FieldType or one of the +// ProcessingTypes below, without the oneof or repeated flag. +static constexpr const unsigned char kTypeMask = 0x1f; + +// Wire type masks. +static constexpr const unsigned char kNotPackedMask = 0x10; +static constexpr const unsigned char kInvalidMask = 0x20; + +enum ProcessingTypes { + TYPE_STRING_CORD = 19, + TYPE_STRING_STRING_PIECE = 20, + TYPE_BYTES_CORD = 21, + TYPE_BYTES_STRING_PIECE = 22, + TYPE_STRING_INLINED = 23, + TYPE_BYTES_INLINED = 24, + TYPE_MAP = 25, +}; + +static_assert(TYPE_MAP < kRepeatedMask, "Invalid enum"); + +struct PROTOBUF_EXPORT FieldMetadata { + uint32 offset; // offset of this field in the struct + uint32 tag; // field * 8 + wire_type + // byte offset * 8 + bit_offset; + // if the high bit is set then this is the byte offset of the oneof_case + // for this field. + uint32 has_offset; + uint32 type; // the type of this field. + const void* ptr; // auxiliary data + + // From the serializer point of view each fundamental type can occur in + // 4 different ways. For simplicity we treat all combinations as a cartesion + // product although not all combinations are allowed. + enum FieldTypeClass { + kPresence, + kNoPresence, + kRepeated, + kPacked, + kOneOf, + kNumTypeClasses // must be last enum + }; + // C++ protobuf has 20 fundamental types, were we added Cord and StringPiece + // and also distinquish the same types if they have different wire format. + enum { + kCordType = 19, + kStringPieceType = 20, + kInlinedType = 21, + kNumTypes = 21, + kSpecial = kNumTypes * kNumTypeClasses, + }; + + static int CalculateType(int fundamental_type, FieldTypeClass type_class); +}; + +// TODO(ckennelly): Add a static assertion to ensure that these masks do not +// conflict with wiretypes. + +// ParseTableField is kept small to help simplify instructions for computing +// offsets, as we will always need this information to parse a field. +// Additional data, needed for some types, is stored in +// AuxiliaryParseTableField. +struct ParseTableField { + uint32 offset; + // The presence_index ordinarily represents a has_bit index, but for fields + // inside a oneof it represents the index in _oneof_case_. + uint32 presence_index; + unsigned char normal_wiretype; + unsigned char packed_wiretype; + + // processing_type is given by: + // (FieldDescriptor->type() << 1) | FieldDescriptor->is_packed() + unsigned char processing_type; + + unsigned char tag_size; +}; + +struct ParseTable; + +union AuxiliaryParseTableField { + typedef bool (*EnumValidator)(int); + + // Enums + struct enum_aux { + EnumValidator validator; + }; + enum_aux enums; + // Group, messages + struct message_aux { + // ExplicitlyInitialized -> T requires a reinterpret_cast, which prevents + // the tables from being constructed as a constexpr. We use void to avoid + // the cast. + const void* default_message_void; + const MessageLite* default_message() const { + return static_cast(default_message_void); + } + }; + message_aux messages; + // Strings + struct string_aux { + const void* default_ptr; + const char* field_name; + }; + string_aux strings; + + struct map_aux { + bool (*parse_map)(io::CodedInputStream*, void*); + }; + map_aux maps; + + AuxiliaryParseTableField() = default; + constexpr AuxiliaryParseTableField(AuxiliaryParseTableField::enum_aux e) + : enums(e) {} + constexpr AuxiliaryParseTableField(AuxiliaryParseTableField::message_aux m) + : messages(m) {} + constexpr AuxiliaryParseTableField(AuxiliaryParseTableField::string_aux s) + : strings(s) {} + constexpr AuxiliaryParseTableField(AuxiliaryParseTableField::map_aux m) + : maps(m) {} +}; + +struct ParseTable { + const ParseTableField* fields; + const AuxiliaryParseTableField* aux; + int max_field_number; + // TODO(ckennelly): Do something with this padding. + + // TODO(ckennelly): Vet these for sign extension. + int64 has_bits_offset; + int64 oneof_case_offset; + int64 extension_offset; + int64 arena_offset; + + // ExplicitlyInitialized -> T requires a reinterpret_cast, which prevents + // the tables from being constructed as a constexpr. We use void to avoid + // the cast. + const void* default_instance_void; + const MessageLite* default_instance() const { + return static_cast(default_instance_void); + } + + bool unknown_field_set; +}; + +static_assert(sizeof(ParseTableField) <= 16, "ParseTableField is too large"); +// The tables must be composed of POD components to ensure link-time +// initialization. +static_assert(std::is_pod::value, ""); +static_assert(std::is_pod::value, ""); +static_assert(std::is_pod::value, ""); +static_assert(std::is_pod::value, ""); +static_assert(std::is_pod::value, ""); +static_assert(std::is_pod::value, ""); + +// TODO(ckennelly): Consolidate these implementations into a single one, using +// dynamic dispatch to the appropriate unknown field handler. +bool MergePartialFromCodedStream(MessageLite* msg, const ParseTable& table, + io::CodedInputStream* input); +bool MergePartialFromCodedStreamLite(MessageLite* msg, const ParseTable& table, + io::CodedInputStream* input); + +template +bool ParseMap(io::CodedInputStream* input, void* map_field) { + typedef typename MapEntryToMapField::MapFieldType MapFieldType; + typedef Map + MapType; + typedef typename Entry::template Parser ParserType; + + ParserType parser(static_cast(map_field)); + return WireFormatLite::ReadMessageNoVirtual(input, &parser); +} + +struct SerializationTable { + int num_fields; + const FieldMetadata* field_table; +}; + +PROTOBUF_EXPORT void SerializeInternal(const uint8* base, + const FieldMetadata* table, + int32 num_fields, + io::CodedOutputStream* output); + +inline void TableSerialize(const MessageLite& msg, + const SerializationTable* table, + io::CodedOutputStream* output) { + const FieldMetadata* field_table = table->field_table; + int num_fields = table->num_fields - 1; + const uint8* base = reinterpret_cast(&msg); + // TODO(gerbens) This skips the first test if we could use the fast + // array serialization path, we should make this + // int cached_size = + // *reinterpret_cast(base + field_table->offset); + // SerializeWithCachedSize(msg, field_table + 1, num_fields, cached_size, ...) + // But we keep conformance with the old way for now. + SerializeInternal(base, field_table + 1, num_fields, output); +} + +uint8* SerializeInternalToArray(const uint8* base, const FieldMetadata* table, + int32 num_fields, bool is_deterministic, + uint8* buffer); + +inline uint8* TableSerializeToArray(const MessageLite& msg, + const SerializationTable* table, + bool is_deterministic, uint8* buffer) { + const uint8* base = reinterpret_cast(&msg); + const FieldMetadata* field_table = table->field_table + 1; + int num_fields = table->num_fields - 1; + return SerializeInternalToArray(base, field_table, num_fields, + is_deterministic, buffer); +} + +template +struct CompareHelper { + bool operator()(const T& a, const T& b) const { return a < b; } +}; + +template <> +struct CompareHelper { + bool operator()(const ArenaStringPtr& a, const ArenaStringPtr& b) const { + return a.Get() < b.Get(); + } +}; + +struct CompareMapKey { + template + bool operator()(const MapEntryHelper& a, + const MapEntryHelper& b) const { + return Compare(a.key_, b.key_); + } + template + bool Compare(const T& a, const T& b) const { + return CompareHelper()(a, b); + } +}; + +template +void MapFieldSerializer(const uint8* base, uint32 offset, uint32 tag, + uint32 has_offset, io::CodedOutputStream* output) { + typedef MapEntryHelper Entry; + typedef typename MapFieldType::MapType::const_iterator Iter; + + const MapFieldType& map_field = + *reinterpret_cast(base + offset); + const SerializationTable* t = + table + + has_offset; // has_offset is overloaded for maps to mean table offset + if (!output->IsSerializationDeterministic()) { + for (Iter it = map_field.GetMap().begin(); it != map_field.GetMap().end(); + ++it) { + Entry map_entry(*it); + output->WriteVarint32(tag); + output->WriteVarint32(map_entry._cached_size_); + SerializeInternal(reinterpret_cast(&map_entry), + t->field_table, t->num_fields, output); + } + } else { + std::vector v; + for (Iter it = map_field.GetMap().begin(); it != map_field.GetMap().end(); + ++it) { + v.push_back(Entry(*it)); + } + std::sort(v.begin(), v.end(), CompareMapKey()); + for (int i = 0; i < v.size(); i++) { + output->WriteVarint32(tag); + output->WriteVarint32(v[i]._cached_size_); + SerializeInternal(reinterpret_cast(&v[i]), t->field_table, + t->num_fields, output); + } + } +} + +} // namespace internal +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_GENERATED_MESSAGE_TABLE_DRIVEN_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/implicit_weak_message.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/implicit_weak_message.h new file mode 100644 index 0000000000000000000000000000000000000000..d373a520a14bf5ab8f0bd6a354e84d0a0f1580ce --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/implicit_weak_message.h @@ -0,0 +1,195 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_IMPLICIT_WEAK_MESSAGE_H__ +#define GOOGLE_PROTOBUF_IMPLICIT_WEAK_MESSAGE_H__ + +#include + +#include +#include +#include +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +#include + +// This file is logically internal-only and should only be used by protobuf +// generated code. + +namespace google { +namespace protobuf { +namespace internal { + +// An implementation of MessageLite that treats all data as unknown. This type +// acts as a placeholder for an implicit weak field in the case where the true +// message type does not get linked into the binary. +class PROTOBUF_EXPORT ImplicitWeakMessage : public MessageLite { + public: + ImplicitWeakMessage() {} + explicit ImplicitWeakMessage(Arena* arena) : MessageLite(arena) {} + + static const ImplicitWeakMessage* default_instance(); + + std::string GetTypeName() const override { return ""; } + + MessageLite* New() const override { return new ImplicitWeakMessage; } + MessageLite* New(Arena* arena) const override { + return Arena::CreateMessage(arena); + } + + void Clear() override { data_.clear(); } + + bool IsInitialized() const override { return true; } + + void CheckTypeAndMergeFrom(const MessageLite& other) override { + data_.append(static_cast(other).data_); + } + + const char* _InternalParse(const char* ptr, ParseContext* ctx) final; + + size_t ByteSizeLong() const override { return data_.size(); } + + uint8* _InternalSerialize(uint8* target, + io::EpsCopyOutputStream* stream) const final { + return stream->WriteRaw(data_.data(), static_cast(data_.size()), + target); + } + + int GetCachedSize() const override { return static_cast(data_.size()); } + + typedef void InternalArenaConstructable_; + + private: + std::string data_; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ImplicitWeakMessage); +}; + +// A type handler for use with implicit weak repeated message fields. +template +class ImplicitWeakTypeHandler { + public: + typedef MessageLite Type; + static constexpr bool Moveable = false; + + static inline MessageLite* NewFromPrototype(const MessageLite* prototype, + Arena* arena = NULL) { + return prototype->New(arena); + } + + static inline void Delete(MessageLite* value, Arena* arena) { + if (arena == NULL) { + delete value; + } + } + static inline Arena* GetArena(MessageLite* value) { + return value->GetArena(); + } + static inline void* GetMaybeArenaPointer(MessageLite* value) { + return value->GetArena(); + } + static inline void Clear(MessageLite* value) { value->Clear(); } + static void Merge(const MessageLite& from, MessageLite* to) { + to->CheckTypeAndMergeFrom(from); + } +}; + +} // namespace internal + +template +struct WeakRepeatedPtrField { + using TypeHandler = internal::ImplicitWeakTypeHandler; + WeakRepeatedPtrField() : weak() {} + explicit WeakRepeatedPtrField(Arena* arena) : weak(arena) {} + ~WeakRepeatedPtrField() { weak.template Destroy(); } + + typedef internal::RepeatedPtrIterator iterator; + typedef internal::RepeatedPtrIterator const_iterator; + typedef internal::RepeatedPtrOverPtrsIterator + pointer_iterator; + typedef internal::RepeatedPtrOverPtrsIterator + const_pointer_iterator; + + iterator begin() { return iterator(base().raw_data()); } + const_iterator begin() const { return iterator(base().raw_data()); } + const_iterator cbegin() const { return begin(); } + iterator end() { return begin() + base().size(); } + const_iterator end() const { return begin() + base().size(); } + const_iterator cend() const { return end(); } + pointer_iterator pointer_begin() { + return pointer_iterator(base().raw_mutable_data()); + } + const_pointer_iterator pointer_begin() const { + return const_pointer_iterator(base().raw_mutable_data()); + } + pointer_iterator pointer_end() { + return pointer_iterator(base().raw_mutable_data() + base().size()); + } + const_pointer_iterator pointer_end() const { + return const_pointer_iterator(base().raw_mutable_data() + base().size()); + } + + MessageLite* AddWeak(const MessageLite* prototype) { + return base().AddWeak(prototype); + } + T* Add() { return weak.Add(); } + void Clear() { base().template Clear(); } + void MergeFrom(const WeakRepeatedPtrField& other) { + base().template MergeFrom(other.base()); + } + void InternalSwap(WeakRepeatedPtrField* other) { + base().InternalSwap(&other->base()); + } + + const internal::RepeatedPtrFieldBase& base() const { return weak; } + internal::RepeatedPtrFieldBase& base() { return weak; } + // Union disables running the destructor. Which would create a strong link. + // Instead we explicitly destroy the underlying base through the virtual + // destructor. + union { + RepeatedPtrField weak; + }; +}; + +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_IMPLICIT_WEAK_MESSAGE_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_entry_lite.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_entry_lite.h new file mode 100644 index 0000000000000000000000000000000000000000..eb1a33b1937b979d09dc38664c1c937910a9f62d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_entry_lite.h @@ -0,0 +1,676 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_MAP_ENTRY_LITE_H__ +#define GOOGLE_PROTOBUF_MAP_ENTRY_LITE_H__ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +namespace google { +namespace protobuf { +namespace internal { +template +class MapEntry; +template +class MapFieldLite; +} // namespace internal +} // namespace protobuf +} // namespace google + +namespace google { +namespace protobuf { +namespace internal { + +// MoveHelper::Move is used to set *dest. It copies *src, or moves it (in +// the C++11 sense), or swaps it. *src is left in a sane state for +// subsequent destruction, but shouldn't be used for anything. +template +struct MoveHelper { // primitives + static void Move(T* src, T* dest) { *dest = *src; } +}; + +template +struct MoveHelper { // enums + static void Move(T* src, T* dest) { *dest = *src; } + // T is an enum here, so allow conversions to and from int. + static void Move(T* src, int* dest) { *dest = static_cast(*src); } + static void Move(int* src, T* dest) { *dest = static_cast(*src); } +}; + +template +struct MoveHelper { // messages + static void Move(T* src, T* dest) { dest->Swap(src); } +}; + +template +struct MoveHelper { // strings and similar + static void Move(T* src, T* dest) { +#if __cplusplus >= 201103L + *dest = std::move(*src); +#else + dest->swap(*src); +#endif + } +}; + +// Functions for operating on a map entry. Does not contain any representation +// (this class is not intended to be instantiated). +template +struct MapEntryFuncs { + typedef MapTypeHandler KeyTypeHandler; + typedef MapTypeHandler ValueTypeHandler; + static const int kKeyFieldNumber = 1; + static const int kValueFieldNumber = 2; + + static uint8* InternalSerialize(int field_number, const Key& key, + const Value& value, uint8* ptr, + io::EpsCopyOutputStream* stream) { + ptr = stream->EnsureSpace(ptr); + ptr = WireFormatLite::WriteTagToArray( + field_number, WireFormatLite::WIRETYPE_LENGTH_DELIMITED, ptr); + ptr = io::CodedOutputStream::WriteVarint32ToArray(GetCachedSize(key, value), + ptr); + + ptr = KeyTypeHandler::Write(kKeyFieldNumber, key, ptr, stream); + return ValueTypeHandler::Write(kValueFieldNumber, value, ptr, stream); + } + + static size_t ByteSizeLong(const Key& key, const Value& value) { + // Tags for key and value will both be one byte (field numbers 1 and 2). + size_t inner_length = + 2 + KeyTypeHandler::ByteSize(key) + ValueTypeHandler::ByteSize(value); + return inner_length + io::CodedOutputStream::VarintSize32( + static_cast(inner_length)); + } + + static int GetCachedSize(const Key& key, const Value& value) { + // Tags for key and value will both be one byte (field numbers 1 and 2). + return 2 + KeyTypeHandler::GetCachedSize(key) + + ValueTypeHandler::GetCachedSize(value); + } +}; + +// MapEntryImpl is used to implement parsing and serialization of map entries. +// It uses Curious Recursive Template Pattern (CRTP) to provide the type of +// the eventual code to the template code. +template +class MapEntryImpl : public Base { + public: + typedef MapEntryFuncs Funcs; + + protected: + // Provide utilities to parse/serialize key/value. Provide utilities to + // manipulate internal stored type. + typedef MapTypeHandler KeyTypeHandler; + typedef MapTypeHandler ValueTypeHandler; + + // Define internal memory layout. Strings and messages are stored as + // pointers, while other types are stored as values. + typedef typename KeyTypeHandler::TypeOnMemory KeyOnMemory; + typedef typename ValueTypeHandler::TypeOnMemory ValueOnMemory; + + // Enum type cannot be used for MapTypeHandler::Read. Define a type + // which will replace Enum with int. + typedef typename KeyTypeHandler::MapEntryAccessorType KeyMapEntryAccessorType; + typedef + typename ValueTypeHandler::MapEntryAccessorType ValueMapEntryAccessorType; + + // Constants for field number. + static const int kKeyFieldNumber = 1; + static const int kValueFieldNumber = 2; + + // Constants for field tag. + static const uint8 kKeyTag = + GOOGLE_PROTOBUF_WIRE_FORMAT_MAKE_TAG(kKeyFieldNumber, KeyTypeHandler::kWireType); + static const uint8 kValueTag = GOOGLE_PROTOBUF_WIRE_FORMAT_MAKE_TAG( + kValueFieldNumber, ValueTypeHandler::kWireType); + static const size_t kTagSize = 1; + + public: + // Work-around for a compiler bug (see repeated_field.h). + typedef void MapEntryHasMergeTypeTrait; + typedef Derived EntryType; + typedef Key EntryKeyType; + typedef Value EntryValueType; + static const WireFormatLite::FieldType kEntryKeyFieldType = kKeyFieldType; + static const WireFormatLite::FieldType kEntryValueFieldType = kValueFieldType; + static const int kEntryDefaultEnumValue = default_enum_value; + + MapEntryImpl() { + KeyTypeHandler::Initialize(&key_, NULL); + ValueTypeHandler::InitializeMaybeByDefaultEnum(&value_, default_enum_value, + NULL); + _has_bits_[0] = 0; + } + + explicit MapEntryImpl(Arena* arena) : Base(arena) { + KeyTypeHandler::Initialize(&key_, arena); + ValueTypeHandler::InitializeMaybeByDefaultEnum(&value_, default_enum_value, + arena); + _has_bits_[0] = 0; + } + + ~MapEntryImpl() { + if (Base::GetArena() != NULL) return; + KeyTypeHandler::DeleteNoArena(key_); + ValueTypeHandler::DeleteNoArena(value_); + } + + // accessors ====================================================== + + virtual inline const KeyMapEntryAccessorType& key() const { + return KeyTypeHandler::GetExternalReference(key_); + } + virtual inline const ValueMapEntryAccessorType& value() const { + return ValueTypeHandler::DefaultIfNotInitialized( + value_, Derived::internal_default_instance()->value_); + } + inline KeyMapEntryAccessorType* mutable_key() { + set_has_key(); + return KeyTypeHandler::EnsureMutable(&key_, Base::GetArena()); + } + inline ValueMapEntryAccessorType* mutable_value() { + set_has_value(); + return ValueTypeHandler::EnsureMutable(&value_, Base::GetArena()); + } + + // implements MessageLite ========================================= + + // MapEntryImpl is for implementation only and this function isn't called + // anywhere. Just provide a fake implementation here for MessageLite. + std::string GetTypeName() const override { return ""; } + + void CheckTypeAndMergeFrom(const MessageLite& other) override { + MergeFromInternal(*::google::protobuf::internal::DownCast(&other)); + } + + const char* _InternalParse(const char* ptr, ParseContext* ctx) final { + while (!ctx->Done(&ptr)) { + uint32 tag; + ptr = ReadTag(ptr, &tag); + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr); + if (tag == kKeyTag) { + set_has_key(); + KeyMapEntryAccessorType* key = mutable_key(); + ptr = KeyTypeHandler::Read(ptr, ctx, key); + if (!Derived::ValidateKey(key)) return nullptr; + } else if (tag == kValueTag) { + set_has_value(); + ValueMapEntryAccessorType* value = mutable_value(); + ptr = ValueTypeHandler::Read(ptr, ctx, value); + if (!Derived::ValidateValue(value)) return nullptr; + } else { + if (tag == 0 || WireFormatLite::GetTagWireType(tag) == + WireFormatLite::WIRETYPE_END_GROUP) { + ctx->SetLastTag(tag); + return ptr; + } + ptr = UnknownFieldParse(tag, static_cast(nullptr), ptr, + ctx); + } + GOOGLE_PROTOBUF_PARSER_ASSERT(ptr); + } + return ptr; + } + + size_t ByteSizeLong() const override { + size_t size = 0; + size += kTagSize + static_cast(KeyTypeHandler::ByteSize(key())); + size += kTagSize + static_cast(ValueTypeHandler::ByteSize(value())); + return size; + } + + ::google::protobuf::uint8* _InternalSerialize(::google::protobuf::uint8* ptr, + io::EpsCopyOutputStream* stream) const override { + ptr = KeyTypeHandler::Write(kKeyFieldNumber, key(), ptr, stream); + return ValueTypeHandler::Write(kValueFieldNumber, value(), ptr, stream); + } + + // Don't override SerializeWithCachedSizesToArray. Use MessageLite's. + + int GetCachedSize() const override { + int size = 0; + size += has_key() ? static_cast(kTagSize) + + KeyTypeHandler::GetCachedSize(key()) + : 0; + size += has_value() ? static_cast(kTagSize) + + ValueTypeHandler::GetCachedSize(value()) + : 0; + return size; + } + + bool IsInitialized() const override { + return ValueTypeHandler::IsInitialized(value_); + } + + Base* New() const override { + Derived* entry = new Derived; + return entry; + } + + Base* New(Arena* arena) const override { + Derived* entry = Arena::CreateMessage(arena); + return entry; + } + + protected: + // We can't declare this function directly here as it would hide the other + // overload (const Message&). + void MergeFromInternal(const MapEntryImpl& from) { + if (from._has_bits_[0]) { + if (from.has_key()) { + KeyTypeHandler::EnsureMutable(&key_, Base::GetArena()); + KeyTypeHandler::Merge(from.key(), &key_, Base::GetArena()); + set_has_key(); + } + if (from.has_value()) { + ValueTypeHandler::EnsureMutable(&value_, Base::GetArena()); + ValueTypeHandler::Merge(from.value(), &value_, Base::GetArena()); + set_has_value(); + } + } + } + + public: + void Clear() override { + KeyTypeHandler::Clear(&key_, Base::GetArena()); + ValueTypeHandler::ClearMaybeByDefaultEnum(&value_, Base::GetArena(), + default_enum_value); + clear_has_key(); + clear_has_value(); + } + + static void InitAsDefaultInstance() { + Derived* d = const_cast(Derived::internal_default_instance()); + KeyTypeHandler::AssignDefaultValue(&d->key_); + ValueTypeHandler::AssignDefaultValue(&d->value_); + } + + // Parsing using MergePartialFromCodedStream, above, is not as + // efficient as it could be. This helper class provides a speedier way. + template + class Parser { + public: + explicit Parser(MapField* mf) : mf_(mf), map_(mf->MutableMap()) {} + ~Parser() { + if (entry_ != nullptr && entry_->GetArena() == nullptr) delete entry_; + } + + // This does what the typical MergePartialFromCodedStream() is expected to + // do, with the additional side-effect that if successful (i.e., if true is + // going to be its return value) it inserts the key-value pair into map_. + bool MergePartialFromCodedStream(io::CodedInputStream* input) { + // Look for the expected thing: a key and then a value. If it fails, + // invoke the enclosing class's MergePartialFromCodedStream, or return + // false if that would be pointless. + if (input->ExpectTag(kKeyTag)) { + if (!KeyTypeHandler::Read(input, &key_)) { + return false; + } + // Peek at the next byte to see if it is kValueTag. If not, bail out. + const void* data; + int size; + input->GetDirectBufferPointerInline(&data, &size); + // We could use memcmp here, but we don't bother. The tag is one byte. + static_assert(kTagSize == 1, "tag size must be 1"); + if (size > 0 && *reinterpret_cast(data) == kValueTag) { + typename Map::size_type map_size = map_->size(); + value_ptr_ = &(*map_)[key_]; + if (PROTOBUF_PREDICT_TRUE(map_size != map_->size())) { + // We created a new key-value pair. Fill in the value. + typedef + typename MapIf::type T; + input->Skip(kTagSize); // Skip kValueTag. + if (!ValueTypeHandler::Read(input, + reinterpret_cast(value_ptr_))) { + map_->erase(key_); // Failure! Undo insertion. + return false; + } + if (input->ExpectAtEnd()) return true; + return ReadBeyondKeyValuePair(input); + } + } + } else { + key_ = Key(); + } + + NewEntry(); + *entry_->mutable_key() = key_; + const bool result = entry_->MergePartialFromCodedStream(input); + if (result) UseKeyAndValueFromEntry(); + return result; + } + + const char* _InternalParse(const char* ptr, ParseContext* ctx) { + if (PROTOBUF_PREDICT_TRUE(!ctx->Done(&ptr) && *ptr == kKeyTag)) { + ptr = KeyTypeHandler::Read(ptr + 1, ctx, &key_); + if (PROTOBUF_PREDICT_FALSE(!ptr || !Derived::ValidateKey(&key_))) { + return nullptr; + } + if (PROTOBUF_PREDICT_TRUE(!ctx->Done(&ptr) && *ptr == kValueTag)) { + typename Map::size_type map_size = map_->size(); + value_ptr_ = &(*map_)[key_]; + if (PROTOBUF_PREDICT_TRUE(map_size != map_->size())) { + using T = + typename MapIf::type; + ptr = ValueTypeHandler::Read(ptr + 1, ctx, + reinterpret_cast(value_ptr_)); + if (PROTOBUF_PREDICT_FALSE(!ptr || + !Derived::ValidateValue(value_ptr_))) { + map_->erase(key_); // Failure! Undo insertion. + return nullptr; + } + if (PROTOBUF_PREDICT_TRUE(ctx->Done(&ptr))) return ptr; + if (!ptr) return nullptr; + NewEntry(); + ValueMover::Move(value_ptr_, entry_->mutable_value()); + map_->erase(key_); + goto move_key; + } + } else { + if (!ptr) return nullptr; + } + NewEntry(); + move_key: + KeyMover::Move(&key_, entry_->mutable_key()); + } else { + if (!ptr) return nullptr; + NewEntry(); + } + ptr = entry_->_InternalParse(ptr, ctx); + if (ptr) UseKeyAndValueFromEntry(); + return ptr; + } + + template + const char* ParseWithEnumValidation(const char* ptr, ParseContext* ctx, + bool (*is_valid)(int), uint32 field_num, + InternalMetadata* metadata) { + auto entry = NewEntry(); + ptr = entry->_InternalParse(ptr, ctx); + if (!ptr) return nullptr; + if (is_valid(entry->value())) { + UseKeyAndValueFromEntry(); + } else { + WriteLengthDelimited(field_num, entry->SerializeAsString(), + metadata->mutable_unknown_fields()); + } + return ptr; + } + + MapEntryImpl* NewEntry() { return entry_ = mf_->NewEntry(); } + + const Key& key() const { return key_; } + const Value& value() const { return *value_ptr_; } + + const Key& entry_key() const { return entry_->key(); } + const Value& entry_value() const { return entry_->value(); } + + private: + void UseKeyAndValueFromEntry() { + // Update key_ in case we need it later (because key() is called). + // This is potentially inefficient, especially if the key is + // expensive to copy (e.g., a long string), but this is a cold + // path, so it's not a big deal. + key_ = entry_->key(); + value_ptr_ = &(*map_)[key_]; + ValueMover::Move(entry_->mutable_value(), value_ptr_); + } + + // After reading a key and value successfully, and inserting that data + // into map_, we are not at the end of the input. This is unusual, but + // allowed by the spec. + bool ReadBeyondKeyValuePair(io::CodedInputStream* input) PROTOBUF_COLD { + NewEntry(); + ValueMover::Move(value_ptr_, entry_->mutable_value()); + map_->erase(key_); + KeyMover::Move(&key_, entry_->mutable_key()); + const bool result = entry_->MergePartialFromCodedStream(input); + if (result) UseKeyAndValueFromEntry(); + return result; + } + + typedef MoveHelper + KeyMover; + typedef MoveHelper + ValueMover; + + MapField* const mf_; + Map* const map_; + Key key_; + Value* value_ptr_; + MapEntryImpl* entry_ = nullptr; + }; + + protected: + void set_has_key() { _has_bits_[0] |= 0x00000001u; } + bool has_key() const { return (_has_bits_[0] & 0x00000001u) != 0; } + void clear_has_key() { _has_bits_[0] &= ~0x00000001u; } + void set_has_value() { _has_bits_[0] |= 0x00000002u; } + bool has_value() const { return (_has_bits_[0] & 0x00000002u) != 0; } + void clear_has_value() { _has_bits_[0] &= ~0x00000002u; } + + public: + inline Arena* GetArena() const { return Base::GetArena(); } + + public: // Needed for constructing tables + KeyOnMemory key_; + ValueOnMemory value_; + uint32 _has_bits_[1]; + + private: + friend class ::PROTOBUF_NAMESPACE_ID::Arena; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + template + friend class internal::MapEntry; + template + friend class internal::MapFieldLite; + + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(MapEntryImpl); +}; + +template +class MapEntryLite + : public MapEntryImpl { + public: + typedef MapEntryImpl + SuperType; + MapEntryLite() {} + explicit MapEntryLite(Arena* arena) : SuperType(arena) {} + ~MapEntryLite() { MessageLite::_internal_metadata_.Delete(); } + void MergeFrom(const MapEntryLite& other) { MergeFromInternal(other); } + + private: + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(MapEntryLite); +}; +// The completely unprincipled and unwieldy use of template parameters in +// the map code necessitates wrappers to make the code a little bit more +// manageable. +template +struct DeconstructMapEntry; + +template +struct DeconstructMapEntry > { + typedef K Key; + typedef V Value; + static const WireFormatLite::FieldType kKeyFieldType = key; + static const WireFormatLite::FieldType kValueFieldType = value; + static const int default_enum_value = default_enum; +}; + +// Helpers for deterministic serialization ============================= + +// This struct can be used with any generic sorting algorithm. If the Key +// type is relatively small and easy to copy then copying Keys into an +// array of SortItems can be beneficial. Then all the data the sorting +// algorithm needs to touch is in that one array. +template +struct SortItem { + SortItem() {} + explicit SortItem(PtrToKeyValuePair p) : first(p->first), second(p) {} + + Key first; + PtrToKeyValuePair second; +}; + +template +struct CompareByFirstField { + bool operator()(const T& a, const T& b) const { return a.first < b.first; } +}; + +template +struct CompareByDerefFirst { + bool operator()(const T& a, const T& b) const { return a->first < b->first; } +}; + +// Helper for table driven serialization + +template +struct FromHelper { + template + static const T& From(const T& x) { + return x; + } +}; + +template <> +struct FromHelper { + static ArenaStringPtr From(const std::string& x) { + ArenaStringPtr res; + TaggedPtr ptr; + ptr.Set(const_cast(&x)); + res.UnsafeSetTaggedPointer(ptr); + return res; + } +}; +template <> +struct FromHelper { + static ArenaStringPtr From(const std::string& x) { + ArenaStringPtr res; + TaggedPtr ptr; + ptr.Set(const_cast(&x)); + res.UnsafeSetTaggedPointer(ptr); + return res; + } +}; +template <> +struct FromHelper { + template + static T* From(const T& x) { + return const_cast(&x); + } +}; + +template +struct MapEntryHelper; + +template +struct MapEntryHelper > { + // Provide utilities to parse/serialize key/value. Provide utilities to + // manipulate internal stored type. + typedef MapTypeHandler KeyTypeHandler; + typedef MapTypeHandler ValueTypeHandler; + + // Define internal memory layout. Strings and messages are stored as + // pointers, while other types are stored as values. + typedef typename KeyTypeHandler::TypeOnMemory KeyOnMemory; + typedef typename ValueTypeHandler::TypeOnMemory ValueOnMemory; + + explicit MapEntryHelper(const MapPair& map_pair) + : _has_bits_(3), + _cached_size_(2 + KeyTypeHandler::GetCachedSize(map_pair.first) + + ValueTypeHandler::GetCachedSize(map_pair.second)), + key_(FromHelper::From(map_pair.first)), + value_(FromHelper::From(map_pair.second)) {} + + // Purposely not following the style guide naming. These are the names + // the proto compiler would generate given the map entry descriptor. + // The proto compiler generates the offsets in this struct as if this was + // a regular message. This way the table driven code barely notices it's + // dealing with a map field. + uint32 _has_bits_; // NOLINT + uint32 _cached_size_; // NOLINT + KeyOnMemory key_; // NOLINT + ValueOnMemory value_; // NOLINT +}; + +} // namespace internal +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_MAP_ENTRY_LITE_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/metadata.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/metadata.h new file mode 100644 index 0000000000000000000000000000000000000000..756936991766a4ef660ba971bc9b9172bd734b8d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/metadata.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLE_PROTOBUF_METADATA_H__ +#define GOOGLE_PROTOBUF_METADATA_H__ + +// TODO(b/151117630): Remove this file and all instances where it gets imported. + +#endif // GOOGLE_PROTOBUF_METADATA_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/port.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/port.h new file mode 100644 index 0000000000000000000000000000000000000000..cfb7d4fcdf21632bf9a342aa72d3d5c15c8c1615 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/port.h @@ -0,0 +1,48 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// A common header that is included across all protobuf headers. We do our best +// to avoid #defining any macros here; instead we generally put macros in +// port_def.inc and port_undef.inc so they are not visible from outside of +// protobuf. + +#ifndef GOOGLE_PROTOBUF_PORT_H__ +#define GOOGLE_PROTOBUF_PORT_H__ + + +#include + + +#endif // GOOGLE_PROTOBUF_PORT_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/reflection.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/reflection.h new file mode 100644 index 0000000000000000000000000000000000000000..ff2da6f7f6b11af5f6fc0dce7be23fb67fc51345 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/reflection.h @@ -0,0 +1,568 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This header defines the RepeatedFieldRef class template used to access +// repeated fields with protobuf reflection API. +#ifndef GOOGLE_PROTOBUF_REFLECTION_H__ +#define GOOGLE_PROTOBUF_REFLECTION_H__ + +#include + +#include +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +#include + +namespace google { +namespace protobuf { +namespace internal { +template +struct RefTypeTraits; +} // namespace internal + +template +RepeatedFieldRef Reflection::GetRepeatedFieldRef( + const Message& message, const FieldDescriptor* field) const { + return RepeatedFieldRef(message, field); +} + +template +MutableRepeatedFieldRef Reflection::GetMutableRepeatedFieldRef( + Message* message, const FieldDescriptor* field) const { + return MutableRepeatedFieldRef(message, field); +} + +// RepeatedFieldRef definition for non-message types. +template +class RepeatedFieldRef< + T, typename std::enable_if::value>::type> { + typedef typename internal::RefTypeTraits::iterator IteratorType; + typedef typename internal::RefTypeTraits::AccessorType AccessorType; + + public: + bool empty() const { return accessor_->IsEmpty(data_); } + int size() const { return accessor_->Size(data_); } + T Get(int index) const { return accessor_->template Get(data_, index); } + + typedef IteratorType iterator; + typedef IteratorType const_iterator; + typedef T value_type; + typedef T& reference; + typedef const T& const_reference; + typedef int size_type; + typedef ptrdiff_t difference_type; + + iterator begin() const { return iterator(data_, accessor_, true); } + iterator end() const { return iterator(data_, accessor_, false); } + + private: + friend class Reflection; + RepeatedFieldRef(const Message& message, const FieldDescriptor* field) { + const Reflection* reflection = message.GetReflection(); + data_ = reflection->RepeatedFieldData(const_cast(&message), field, + internal::RefTypeTraits::cpp_type, + NULL); + accessor_ = reflection->RepeatedFieldAccessor(field); + } + + const void* data_; + const AccessorType* accessor_; +}; + +// MutableRepeatedFieldRef definition for non-message types. +template +class MutableRepeatedFieldRef< + T, typename std::enable_if::value>::type> { + typedef typename internal::RefTypeTraits::AccessorType AccessorType; + + public: + bool empty() const { return accessor_->IsEmpty(data_); } + int size() const { return accessor_->Size(data_); } + T Get(int index) const { return accessor_->template Get(data_, index); } + + void Set(int index, const T& value) const { + accessor_->template Set(data_, index, value); + } + void Add(const T& value) const { accessor_->template Add(data_, value); } + void RemoveLast() const { accessor_->RemoveLast(data_); } + void SwapElements(int index1, int index2) const { + accessor_->SwapElements(data_, index1, index2); + } + void Clear() const { accessor_->Clear(data_); } + + void Swap(const MutableRepeatedFieldRef& other) const { + accessor_->Swap(data_, other.accessor_, other.data_); + } + + template + void MergeFrom(const Container& container) const { + typedef typename Container::const_iterator Iterator; + for (Iterator it = container.begin(); it != container.end(); ++it) { + Add(*it); + } + } + template + void CopyFrom(const Container& container) const { + Clear(); + MergeFrom(container); + } + + private: + friend class Reflection; + MutableRepeatedFieldRef(Message* message, const FieldDescriptor* field) { + const Reflection* reflection = message->GetReflection(); + data_ = reflection->RepeatedFieldData( + message, field, internal::RefTypeTraits::cpp_type, NULL); + accessor_ = reflection->RepeatedFieldAccessor(field); + } + + void* data_; + const AccessorType* accessor_; +}; + +// RepeatedFieldRef definition for message types. +template +class RepeatedFieldRef< + T, typename std::enable_if::value>::type> { + typedef typename internal::RefTypeTraits::iterator IteratorType; + typedef typename internal::RefTypeTraits::AccessorType AccessorType; + + public: + bool empty() const { return accessor_->IsEmpty(data_); } + int size() const { return accessor_->Size(data_); } + // This method returns a reference to the underlying message object if it + // exists. If a message object doesn't exist (e.g., data stored in serialized + // form), scratch_space will be filled with the data and a reference to it + // will be returned. + // + // Example: + // RepeatedFieldRef h = ... + // unique_ptr scratch_space(h.NewMessage()); + // const Message& item = h.Get(index, scratch_space.get()); + const T& Get(int index, T* scratch_space) const { + return *static_cast(accessor_->Get(data_, index, scratch_space)); + } + // Create a new message of the same type as the messages stored in this + // repeated field. Caller takes ownership of the returned object. + T* NewMessage() const { return static_cast(default_instance_->New()); } + + typedef IteratorType iterator; + typedef IteratorType const_iterator; + typedef T value_type; + typedef T& reference; + typedef const T& const_reference; + typedef int size_type; + typedef ptrdiff_t difference_type; + + iterator begin() const { + return iterator(data_, accessor_, true, NewMessage()); + } + iterator end() const { + // The end iterator must not be dereferenced, no need for scratch space. + return iterator(data_, accessor_, false, nullptr); + } + + private: + friend class Reflection; + RepeatedFieldRef(const Message& message, const FieldDescriptor* field) { + const Reflection* reflection = message.GetReflection(); + data_ = reflection->RepeatedFieldData( + const_cast(&message), field, + internal::RefTypeTraits::cpp_type, + internal::RefTypeTraits::GetMessageFieldDescriptor()); + accessor_ = reflection->RepeatedFieldAccessor(field); + default_instance_ = + reflection->GetMessageFactory()->GetPrototype(field->message_type()); + } + + const void* data_; + const AccessorType* accessor_; + const Message* default_instance_; +}; + +// MutableRepeatedFieldRef definition for message types. +template +class MutableRepeatedFieldRef< + T, typename std::enable_if::value>::type> { + typedef typename internal::RefTypeTraits::AccessorType AccessorType; + + public: + bool empty() const { return accessor_->IsEmpty(data_); } + int size() const { return accessor_->Size(data_); } + // See comments for RepeatedFieldRef::Get() + const T& Get(int index, T* scratch_space) const { + return *static_cast(accessor_->Get(data_, index, scratch_space)); + } + // Create a new message of the same type as the messages stored in this + // repeated field. Caller takes ownership of the returned object. + T* NewMessage() const { return static_cast(default_instance_->New()); } + + void Set(int index, const T& value) const { + accessor_->Set(data_, index, &value); + } + void Add(const T& value) const { accessor_->Add(data_, &value); } + void RemoveLast() const { accessor_->RemoveLast(data_); } + void SwapElements(int index1, int index2) const { + accessor_->SwapElements(data_, index1, index2); + } + void Clear() const { accessor_->Clear(data_); } + + void Swap(const MutableRepeatedFieldRef& other) const { + accessor_->Swap(data_, other.accessor_, other.data_); + } + + template + void MergeFrom(const Container& container) const { + typedef typename Container::const_iterator Iterator; + for (Iterator it = container.begin(); it != container.end(); ++it) { + Add(*it); + } + } + template + void CopyFrom(const Container& container) const { + Clear(); + MergeFrom(container); + } + + private: + friend class Reflection; + MutableRepeatedFieldRef(Message* message, const FieldDescriptor* field) { + const Reflection* reflection = message->GetReflection(); + data_ = reflection->RepeatedFieldData( + message, field, internal::RefTypeTraits::cpp_type, + internal::RefTypeTraits::GetMessageFieldDescriptor()); + accessor_ = reflection->RepeatedFieldAccessor(field); + default_instance_ = + reflection->GetMessageFactory()->GetPrototype(field->message_type()); + } + + void* data_; + const AccessorType* accessor_; + const Message* default_instance_; +}; + +namespace internal { +// Interfaces used to implement reflection RepeatedFieldRef API. +// Reflection::GetRepeatedAccessor() should return a pointer to an singleton +// object that implements the below interface. +// +// This interface passes/returns values using void pointers. The actual type +// of the value depends on the field's cpp_type. Following is a mapping from +// cpp_type to the type that should be used in this interface: +// +// field->cpp_type() T Actual type of void* +// CPPTYPE_INT32 int32 int32 +// CPPTYPE_UINT32 uint32 uint32 +// CPPTYPE_INT64 int64 int64 +// CPPTYPE_UINT64 uint64 uint64 +// CPPTYPE_DOUBLE double double +// CPPTYPE_FLOAT float float +// CPPTYPE_BOOL bool bool +// CPPTYPE_ENUM generated enum type int32 +// CPPTYPE_STRING string std::string +// CPPTYPE_MESSAGE generated message type google::protobuf::Message +// or google::protobuf::Message +// +// Note that for enums we use int32 in the interface. +// +// You can map from T to the actual type using RefTypeTraits: +// typedef RefTypeTraits::AccessorValueType ActualType; +class PROTOBUF_EXPORT RepeatedFieldAccessor { + public: + // Typedefs for clarity. + typedef void Field; + typedef void Value; + typedef void Iterator; + + virtual bool IsEmpty(const Field* data) const = 0; + virtual int Size(const Field* data) const = 0; + // Depends on the underlying representation of the repeated field, this + // method can return a pointer to the underlying object if such an object + // exists, or fill the data into scratch_space and return scratch_space. + // Callers of this method must ensure scratch_space is a valid pointer + // to a mutable object of the correct type. + virtual const Value* Get(const Field* data, int index, + Value* scratch_space) const = 0; + + virtual void Clear(Field* data) const = 0; + virtual void Set(Field* data, int index, const Value* value) const = 0; + virtual void Add(Field* data, const Value* value) const = 0; + virtual void RemoveLast(Field* data) const = 0; + virtual void SwapElements(Field* data, int index1, int index2) const = 0; + virtual void Swap(Field* data, const RepeatedFieldAccessor* other_mutator, + Field* other_data) const = 0; + + // Create an iterator that points at the beginning of the repeated field. + virtual Iterator* BeginIterator(const Field* data) const = 0; + // Create an iterator that points at the end of the repeated field. + virtual Iterator* EndIterator(const Field* data) const = 0; + // Make a copy of an iterator and return the new copy. + virtual Iterator* CopyIterator(const Field* data, + const Iterator* iterator) const = 0; + // Move an iterator to point to the next element. + virtual Iterator* AdvanceIterator(const Field* data, + Iterator* iterator) const = 0; + // Compare whether two iterators point to the same element. + virtual bool EqualsIterator(const Field* data, const Iterator* a, + const Iterator* b) const = 0; + // Delete an iterator created by BeginIterator(), EndIterator() and + // CopyIterator(). + virtual void DeleteIterator(const Field* data, Iterator* iterator) const = 0; + // Like Get() but for iterators. + virtual const Value* GetIteratorValue(const Field* data, + const Iterator* iterator, + Value* scratch_space) const = 0; + + // Templated methods that make using this interface easier for non-message + // types. + template + T Get(const Field* data, int index) const { + typedef typename RefTypeTraits::AccessorValueType ActualType; + ActualType scratch_space; + return static_cast(*reinterpret_cast( + Get(data, index, static_cast(&scratch_space)))); + } + + template + void Set(Field* data, int index, const ValueType& value) const { + typedef typename RefTypeTraits::AccessorValueType ActualType; + // In this RepeatedFieldAccessor interface we pass/return data using + // raw pointers. Type of the data these raw pointers point to should + // be ActualType. Here we have a ValueType object and want a ActualType + // pointer. We can't cast a ValueType pointer to an ActualType pointer + // directly because their type might be different (for enums ValueType + // may be a generated enum type while ActualType is int32). To be safe + // we make a copy to get a temporary ActualType object and use it. + ActualType tmp = static_cast(value); + Set(data, index, static_cast(&tmp)); + } + + template + void Add(Field* data, const ValueType& value) const { + typedef typename RefTypeTraits::AccessorValueType ActualType; + // In this RepeatedFieldAccessor interface we pass/return data using + // raw pointers. Type of the data these raw pointers point to should + // be ActualType. Here we have a ValueType object and want a ActualType + // pointer. We can't cast a ValueType pointer to an ActualType pointer + // directly because their type might be different (for enums ValueType + // may be a generated enum type while ActualType is int32). To be safe + // we make a copy to get a temporary ActualType object and use it. + ActualType tmp = static_cast(value); + Add(data, static_cast(&tmp)); + } + + protected: + // We want the destructor to be completely trivial as to allow it to be + // a function local static. Hence we make it non-virtual and protected, + // this class only live as part of a global singleton and should not be + // deleted. + ~RepeatedFieldAccessor() = default; +}; + +// Implement (Mutable)RepeatedFieldRef::iterator +template +class RepeatedFieldRefIterator + : public std::iterator { + typedef typename RefTypeTraits::AccessorValueType AccessorValueType; + typedef typename RefTypeTraits::IteratorValueType IteratorValueType; + typedef typename RefTypeTraits::IteratorPointerType IteratorPointerType; + + public: + // Constructor for non-message fields. + RepeatedFieldRefIterator(const void* data, + const RepeatedFieldAccessor* accessor, bool begin) + : data_(data), + accessor_(accessor), + iterator_(begin ? accessor->BeginIterator(data) + : accessor->EndIterator(data)), + // The end iterator must not be dereferenced, no need for scratch space. + scratch_space_(begin ? new AccessorValueType : nullptr) {} + // Constructor for message fields. + RepeatedFieldRefIterator(const void* data, + const RepeatedFieldAccessor* accessor, bool begin, + AccessorValueType* scratch_space) + : data_(data), + accessor_(accessor), + iterator_(begin ? accessor->BeginIterator(data) + : accessor->EndIterator(data)), + scratch_space_(scratch_space) {} + ~RepeatedFieldRefIterator() { accessor_->DeleteIterator(data_, iterator_); } + RepeatedFieldRefIterator operator++(int) { + RepeatedFieldRefIterator tmp(*this); + iterator_ = accessor_->AdvanceIterator(data_, iterator_); + return tmp; + } + RepeatedFieldRefIterator& operator++() { + iterator_ = accessor_->AdvanceIterator(data_, iterator_); + return *this; + } + IteratorValueType operator*() const { + return static_cast( + *static_cast(accessor_->GetIteratorValue( + data_, iterator_, scratch_space_.get()))); + } + IteratorPointerType operator->() const { + return static_cast( + accessor_->GetIteratorValue(data_, iterator_, scratch_space_.get())); + } + bool operator!=(const RepeatedFieldRefIterator& other) const { + assert(data_ == other.data_); + assert(accessor_ == other.accessor_); + return !accessor_->EqualsIterator(data_, iterator_, other.iterator_); + } + bool operator==(const RepeatedFieldRefIterator& other) const { + return !this->operator!=(other); + } + + RepeatedFieldRefIterator(const RepeatedFieldRefIterator& other) + : data_(other.data_), + accessor_(other.accessor_), + iterator_(accessor_->CopyIterator(data_, other.iterator_)) {} + RepeatedFieldRefIterator& operator=(const RepeatedFieldRefIterator& other) { + if (this != &other) { + accessor_->DeleteIterator(data_, iterator_); + data_ = other.data_; + accessor_ = other.accessor_; + iterator_ = accessor_->CopyIterator(data_, other.iterator_); + } + return *this; + } + + protected: + const void* data_; + const RepeatedFieldAccessor* accessor_; + void* iterator_; + std::unique_ptr scratch_space_; +}; + +// TypeTraits that maps the type parameter T of RepeatedFieldRef or +// MutableRepeatedFieldRef to corresponding iterator type, +// RepeatedFieldAccessor type, etc. +template +struct PrimitiveTraits { + static constexpr bool is_primitive = false; +}; +#define DEFINE_PRIMITIVE(TYPE, type) \ + template <> \ + struct PrimitiveTraits { \ + static const bool is_primitive = true; \ + static const FieldDescriptor::CppType cpp_type = \ + FieldDescriptor::CPPTYPE_##TYPE; \ + }; +DEFINE_PRIMITIVE(INT32, int32) +DEFINE_PRIMITIVE(UINT32, uint32) +DEFINE_PRIMITIVE(INT64, int64) +DEFINE_PRIMITIVE(UINT64, uint64) +DEFINE_PRIMITIVE(FLOAT, float) +DEFINE_PRIMITIVE(DOUBLE, double) +DEFINE_PRIMITIVE(BOOL, bool) +#undef DEFINE_PRIMITIVE + +template +struct RefTypeTraits< + T, typename std::enable_if::is_primitive>::type> { + typedef RepeatedFieldRefIterator iterator; + typedef RepeatedFieldAccessor AccessorType; + typedef T AccessorValueType; + typedef T IteratorValueType; + typedef T* IteratorPointerType; + static constexpr FieldDescriptor::CppType cpp_type = + PrimitiveTraits::cpp_type; + static const Descriptor* GetMessageFieldDescriptor() { return NULL; } +}; + +template +struct RefTypeTraits< + T, typename std::enable_if::value>::type> { + typedef RepeatedFieldRefIterator iterator; + typedef RepeatedFieldAccessor AccessorType; + // We use int32 for repeated enums in RepeatedFieldAccessor. + typedef int32 AccessorValueType; + typedef T IteratorValueType; + typedef int32* IteratorPointerType; + static constexpr FieldDescriptor::CppType cpp_type = + FieldDescriptor::CPPTYPE_ENUM; + static const Descriptor* GetMessageFieldDescriptor() { return NULL; } +}; + +template +struct RefTypeTraits< + T, typename std::enable_if::value>::type> { + typedef RepeatedFieldRefIterator iterator; + typedef RepeatedFieldAccessor AccessorType; + typedef std::string AccessorValueType; + typedef const std::string IteratorValueType; + typedef const std::string* IteratorPointerType; + static constexpr FieldDescriptor::CppType cpp_type = + FieldDescriptor::CPPTYPE_STRING; + static const Descriptor* GetMessageFieldDescriptor() { return NULL; } +}; + +template +struct MessageDescriptorGetter { + static const Descriptor* get() { + return T::default_instance().GetDescriptor(); + } +}; +template <> +struct MessageDescriptorGetter { + static const Descriptor* get() { return NULL; } +}; + +template +struct RefTypeTraits< + T, typename std::enable_if::value>::type> { + typedef RepeatedFieldRefIterator iterator; + typedef RepeatedFieldAccessor AccessorType; + typedef Message AccessorValueType; + typedef const T& IteratorValueType; + typedef const T* IteratorPointerType; + static constexpr FieldDescriptor::CppType cpp_type = + FieldDescriptor::CPPTYPE_MESSAGE; + static const Descriptor* GetMessageFieldDescriptor() { + return MessageDescriptorGetter::get(); + } +}; +} // namespace internal +} // namespace protobuf +} // namespace google + +#include + +#endif // GOOGLE_PROTOBUF_REFLECTION_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/struct.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/struct.pb.h new file mode 100644 index 0000000000000000000000000000000000000000..349b293ade6e3011ec94538b2acfb0099be7a369 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/struct.pb.h @@ -0,0 +1,1178 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: google/protobuf/struct.proto + +#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fstruct_2eproto +#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fstruct_2eproto + +#include +#include + +#include +#if PROTOBUF_VERSION < 3013000 +#error This file was generated by a newer version of protoc which is +#error incompatible with your Protocol Buffer headers. Please update +#error your headers. +#endif +#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION +#error This file was generated by an older version of protoc which is +#error incompatible with your Protocol Buffer headers. Please +#error regenerate this file with a newer version of protoc. +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: export +#include // IWYU pragma: export +#include // IWYU pragma: export +#include +#include +#include +#include +// @@protoc_insertion_point(includes) +#include +#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fstruct_2eproto PROTOBUF_EXPORT +PROTOBUF_NAMESPACE_OPEN +namespace internal { +class AnyMetadata; +} // namespace internal +PROTOBUF_NAMESPACE_CLOSE + +// Internal implementation detail -- do not use these members. +struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fstruct_2eproto { + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[4] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[]; + static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[]; + static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[]; +}; +extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fstruct_2eproto; +PROTOBUF_NAMESPACE_OPEN +class ListValue; +class ListValueDefaultTypeInternal; +PROTOBUF_EXPORT extern ListValueDefaultTypeInternal _ListValue_default_instance_; +class Struct; +class StructDefaultTypeInternal; +PROTOBUF_EXPORT extern StructDefaultTypeInternal _Struct_default_instance_; +class Struct_FieldsEntry_DoNotUse; +class Struct_FieldsEntry_DoNotUseDefaultTypeInternal; +PROTOBUF_EXPORT extern Struct_FieldsEntry_DoNotUseDefaultTypeInternal _Struct_FieldsEntry_DoNotUse_default_instance_; +class Value; +class ValueDefaultTypeInternal; +PROTOBUF_EXPORT extern ValueDefaultTypeInternal _Value_default_instance_; +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::ListValue* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Struct* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Struct_FieldsEntry_DoNotUse* Arena::CreateMaybeMessage(Arena*); +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Value* Arena::CreateMaybeMessage(Arena*); +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN + +enum NullValue : int { + NULL_VALUE = 0, + NullValue_INT_MIN_SENTINEL_DO_NOT_USE_ = std::numeric_limits<::PROTOBUF_NAMESPACE_ID::int32>::min(), + NullValue_INT_MAX_SENTINEL_DO_NOT_USE_ = std::numeric_limits<::PROTOBUF_NAMESPACE_ID::int32>::max() +}; +PROTOBUF_EXPORT bool NullValue_IsValid(int value); +constexpr NullValue NullValue_MIN = NULL_VALUE; +constexpr NullValue NullValue_MAX = NULL_VALUE; +constexpr int NullValue_ARRAYSIZE = NullValue_MAX + 1; + +PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* NullValue_descriptor(); +template +inline const std::string& NullValue_Name(T enum_t_value) { + static_assert(::std::is_same::value || + ::std::is_integral::value, + "Incorrect type passed to function NullValue_Name."); + return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum( + NullValue_descriptor(), enum_t_value); +} +inline bool NullValue_Parse( + ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, NullValue* value) { + return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum( + NullValue_descriptor(), name, value); +} +// =================================================================== + +class Struct_FieldsEntry_DoNotUse : public ::PROTOBUF_NAMESPACE_ID::internal::MapEntry { +public: + typedef ::PROTOBUF_NAMESPACE_ID::internal::MapEntry SuperType; + Struct_FieldsEntry_DoNotUse(); + explicit Struct_FieldsEntry_DoNotUse(::PROTOBUF_NAMESPACE_ID::Arena* arena); + void MergeFrom(const Struct_FieldsEntry_DoNotUse& other); + static const Struct_FieldsEntry_DoNotUse* internal_default_instance() { return reinterpret_cast(&_Struct_FieldsEntry_DoNotUse_default_instance_); } + static bool ValidateKey(std::string* s) { + return ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::VerifyUtf8String(s->data(), static_cast(s->size()), ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::PARSE, "google.protobuf.Struct.FieldsEntry.key"); + } + static bool ValidateValue(void*) { return true; } + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& other) final; + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fstruct_2eproto); + return ::descriptor_table_google_2fprotobuf_2fstruct_2eproto.file_level_metadata[0]; + } + + public: +}; + +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT Struct PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Struct) */ { + public: + inline Struct() : Struct(nullptr) {} + virtual ~Struct(); + + Struct(const Struct& from); + Struct(Struct&& from) noexcept + : Struct() { + *this = ::std::move(from); + } + + inline Struct& operator=(const Struct& from) { + CopyFrom(from); + return *this; + } + inline Struct& operator=(Struct&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const Struct& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const Struct* internal_default_instance() { + return reinterpret_cast( + &_Struct_default_instance_); + } + static constexpr int kIndexInFileMessages = + 1; + + friend void swap(Struct& a, Struct& b) { + a.Swap(&b); + } + inline void Swap(Struct* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(Struct* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline Struct* New() const final { + return CreateMaybeMessage(nullptr); + } + + Struct* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const Struct& from); + void MergeFrom(const Struct& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(Struct* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.Struct"; + } + protected: + explicit Struct(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fstruct_2eproto); + return ::descriptor_table_google_2fprotobuf_2fstruct_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + + // accessors ------------------------------------------------------- + + enum : int { + kFieldsFieldNumber = 1, + }; + // map fields = 1; + int fields_size() const; + private: + int _internal_fields_size() const; + public: + void clear_fields(); + private: + const ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >& + _internal_fields() const; + ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >* + _internal_mutable_fields(); + public: + const ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >& + fields() const; + ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >* + mutable_fields(); + + // @@protoc_insertion_point(class_scope:google.protobuf.Struct) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::internal::MapField< + Struct_FieldsEntry_DoNotUse, + std::string, PROTOBUF_NAMESPACE_ID::Value, + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::TYPE_STRING, + ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::TYPE_MESSAGE, + 0 > fields_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2fstruct_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT Value PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Value) */ { + public: + inline Value() : Value(nullptr) {} + virtual ~Value(); + + Value(const Value& from); + Value(Value&& from) noexcept + : Value() { + *this = ::std::move(from); + } + + inline Value& operator=(const Value& from) { + CopyFrom(from); + return *this; + } + inline Value& operator=(Value&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const Value& default_instance(); + + enum KindCase { + kNullValue = 1, + kNumberValue = 2, + kStringValue = 3, + kBoolValue = 4, + kStructValue = 5, + kListValue = 6, + KIND_NOT_SET = 0, + }; + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const Value* internal_default_instance() { + return reinterpret_cast( + &_Value_default_instance_); + } + static constexpr int kIndexInFileMessages = + 2; + + friend void swap(Value& a, Value& b) { + a.Swap(&b); + } + inline void Swap(Value* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(Value* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline Value* New() const final { + return CreateMaybeMessage(nullptr); + } + + Value* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const Value& from); + void MergeFrom(const Value& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(Value* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.Value"; + } + protected: + explicit Value(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fstruct_2eproto); + return ::descriptor_table_google_2fprotobuf_2fstruct_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kNullValueFieldNumber = 1, + kNumberValueFieldNumber = 2, + kStringValueFieldNumber = 3, + kBoolValueFieldNumber = 4, + kStructValueFieldNumber = 5, + kListValueFieldNumber = 6, + }; + // .google.protobuf.NullValue null_value = 1; + private: + bool _internal_has_null_value() const; + public: + void clear_null_value(); + PROTOBUF_NAMESPACE_ID::NullValue null_value() const; + void set_null_value(PROTOBUF_NAMESPACE_ID::NullValue value); + private: + PROTOBUF_NAMESPACE_ID::NullValue _internal_null_value() const; + void _internal_set_null_value(PROTOBUF_NAMESPACE_ID::NullValue value); + public: + + // double number_value = 2; + private: + bool _internal_has_number_value() const; + public: + void clear_number_value(); + double number_value() const; + void set_number_value(double value); + private: + double _internal_number_value() const; + void _internal_set_number_value(double value); + public: + + // string string_value = 3; + private: + bool _internal_has_string_value() const; + public: + void clear_string_value(); + const std::string& string_value() const; + void set_string_value(const std::string& value); + void set_string_value(std::string&& value); + void set_string_value(const char* value); + void set_string_value(const char* value, size_t size); + std::string* mutable_string_value(); + std::string* release_string_value(); + void set_allocated_string_value(std::string* string_value); + private: + const std::string& _internal_string_value() const; + void _internal_set_string_value(const std::string& value); + std::string* _internal_mutable_string_value(); + public: + + // bool bool_value = 4; + private: + bool _internal_has_bool_value() const; + public: + void clear_bool_value(); + bool bool_value() const; + void set_bool_value(bool value); + private: + bool _internal_bool_value() const; + void _internal_set_bool_value(bool value); + public: + + // .google.protobuf.Struct struct_value = 5; + bool has_struct_value() const; + private: + bool _internal_has_struct_value() const; + public: + void clear_struct_value(); + const PROTOBUF_NAMESPACE_ID::Struct& struct_value() const; + PROTOBUF_NAMESPACE_ID::Struct* release_struct_value(); + PROTOBUF_NAMESPACE_ID::Struct* mutable_struct_value(); + void set_allocated_struct_value(PROTOBUF_NAMESPACE_ID::Struct* struct_value); + private: + const PROTOBUF_NAMESPACE_ID::Struct& _internal_struct_value() const; + PROTOBUF_NAMESPACE_ID::Struct* _internal_mutable_struct_value(); + public: + void unsafe_arena_set_allocated_struct_value( + PROTOBUF_NAMESPACE_ID::Struct* struct_value); + PROTOBUF_NAMESPACE_ID::Struct* unsafe_arena_release_struct_value(); + + // .google.protobuf.ListValue list_value = 6; + bool has_list_value() const; + private: + bool _internal_has_list_value() const; + public: + void clear_list_value(); + const PROTOBUF_NAMESPACE_ID::ListValue& list_value() const; + PROTOBUF_NAMESPACE_ID::ListValue* release_list_value(); + PROTOBUF_NAMESPACE_ID::ListValue* mutable_list_value(); + void set_allocated_list_value(PROTOBUF_NAMESPACE_ID::ListValue* list_value); + private: + const PROTOBUF_NAMESPACE_ID::ListValue& _internal_list_value() const; + PROTOBUF_NAMESPACE_ID::ListValue* _internal_mutable_list_value(); + public: + void unsafe_arena_set_allocated_list_value( + PROTOBUF_NAMESPACE_ID::ListValue* list_value); + PROTOBUF_NAMESPACE_ID::ListValue* unsafe_arena_release_list_value(); + + void clear_kind(); + KindCase kind_case() const; + // @@protoc_insertion_point(class_scope:google.protobuf.Value) + private: + class _Internal; + void set_has_null_value(); + void set_has_number_value(); + void set_has_string_value(); + void set_has_bool_value(); + void set_has_struct_value(); + void set_has_list_value(); + + inline bool has_kind() const; + inline void clear_has_kind(); + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + union KindUnion { + KindUnion() {} + int null_value_; + double number_value_; + ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr string_value_; + bool bool_value_; + PROTOBUF_NAMESPACE_ID::Struct* struct_value_; + PROTOBUF_NAMESPACE_ID::ListValue* list_value_; + } kind_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + ::PROTOBUF_NAMESPACE_ID::uint32 _oneof_case_[1]; + + friend struct ::TableStruct_google_2fprotobuf_2fstruct_2eproto; +}; +// ------------------------------------------------------------------- + +class PROTOBUF_EXPORT ListValue PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.ListValue) */ { + public: + inline ListValue() : ListValue(nullptr) {} + virtual ~ListValue(); + + ListValue(const ListValue& from); + ListValue(ListValue&& from) noexcept + : ListValue() { + *this = ::std::move(from); + } + + inline ListValue& operator=(const ListValue& from) { + CopyFrom(from); + return *this; + } + inline ListValue& operator=(ListValue&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const ListValue& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const ListValue* internal_default_instance() { + return reinterpret_cast( + &_ListValue_default_instance_); + } + static constexpr int kIndexInFileMessages = + 3; + + friend void swap(ListValue& a, ListValue& b) { + a.Swap(&b); + } + inline void Swap(ListValue* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(ListValue* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline ListValue* New() const final { + return CreateMaybeMessage(nullptr); + } + + ListValue* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const ListValue& from); + void MergeFrom(const ListValue& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(ListValue* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.ListValue"; + } + protected: + explicit ListValue(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fstruct_2eproto); + return ::descriptor_table_google_2fprotobuf_2fstruct_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kValuesFieldNumber = 1, + }; + // repeated .google.protobuf.Value values = 1; + int values_size() const; + private: + int _internal_values_size() const; + public: + void clear_values(); + PROTOBUF_NAMESPACE_ID::Value* mutable_values(int index); + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Value >* + mutable_values(); + private: + const PROTOBUF_NAMESPACE_ID::Value& _internal_values(int index) const; + PROTOBUF_NAMESPACE_ID::Value* _internal_add_values(); + public: + const PROTOBUF_NAMESPACE_ID::Value& values(int index) const; + PROTOBUF_NAMESPACE_ID::Value* add_values(); + const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Value >& + values() const; + + // @@protoc_insertion_point(class_scope:google.protobuf.ListValue) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Value > values_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2fstruct_2eproto; +}; +// =================================================================== + + +// =================================================================== + +#ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // __GNUC__ +// ------------------------------------------------------------------- + +// Struct + +// map fields = 1; +inline int Struct::_internal_fields_size() const { + return fields_.size(); +} +inline int Struct::fields_size() const { + return _internal_fields_size(); +} +inline void Struct::clear_fields() { + fields_.Clear(); +} +inline const ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >& +Struct::_internal_fields() const { + return fields_.GetMap(); +} +inline const ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >& +Struct::fields() const { + // @@protoc_insertion_point(field_map:google.protobuf.Struct.fields) + return _internal_fields(); +} +inline ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >* +Struct::_internal_mutable_fields() { + return fields_.MutableMap(); +} +inline ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >* +Struct::mutable_fields() { + // @@protoc_insertion_point(field_mutable_map:google.protobuf.Struct.fields) + return _internal_mutable_fields(); +} + +// ------------------------------------------------------------------- + +// Value + +// .google.protobuf.NullValue null_value = 1; +inline bool Value::_internal_has_null_value() const { + return kind_case() == kNullValue; +} +inline void Value::set_has_null_value() { + _oneof_case_[0] = kNullValue; +} +inline void Value::clear_null_value() { + if (_internal_has_null_value()) { + kind_.null_value_ = 0; + clear_has_kind(); + } +} +inline PROTOBUF_NAMESPACE_ID::NullValue Value::_internal_null_value() const { + if (_internal_has_null_value()) { + return static_cast< PROTOBUF_NAMESPACE_ID::NullValue >(kind_.null_value_); + } + return static_cast< PROTOBUF_NAMESPACE_ID::NullValue >(0); +} +inline PROTOBUF_NAMESPACE_ID::NullValue Value::null_value() const { + // @@protoc_insertion_point(field_get:google.protobuf.Value.null_value) + return _internal_null_value(); +} +inline void Value::_internal_set_null_value(PROTOBUF_NAMESPACE_ID::NullValue value) { + if (!_internal_has_null_value()) { + clear_kind(); + set_has_null_value(); + } + kind_.null_value_ = value; +} +inline void Value::set_null_value(PROTOBUF_NAMESPACE_ID::NullValue value) { + // @@protoc_insertion_point(field_set:google.protobuf.Value.null_value) + _internal_set_null_value(value); +} + +// double number_value = 2; +inline bool Value::_internal_has_number_value() const { + return kind_case() == kNumberValue; +} +inline void Value::set_has_number_value() { + _oneof_case_[0] = kNumberValue; +} +inline void Value::clear_number_value() { + if (_internal_has_number_value()) { + kind_.number_value_ = 0; + clear_has_kind(); + } +} +inline double Value::_internal_number_value() const { + if (_internal_has_number_value()) { + return kind_.number_value_; + } + return 0; +} +inline void Value::_internal_set_number_value(double value) { + if (!_internal_has_number_value()) { + clear_kind(); + set_has_number_value(); + } + kind_.number_value_ = value; +} +inline double Value::number_value() const { + // @@protoc_insertion_point(field_get:google.protobuf.Value.number_value) + return _internal_number_value(); +} +inline void Value::set_number_value(double value) { + _internal_set_number_value(value); + // @@protoc_insertion_point(field_set:google.protobuf.Value.number_value) +} + +// string string_value = 3; +inline bool Value::_internal_has_string_value() const { + return kind_case() == kStringValue; +} +inline void Value::set_has_string_value() { + _oneof_case_[0] = kStringValue; +} +inline void Value::clear_string_value() { + if (_internal_has_string_value()) { + kind_.string_value_.Destroy(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + clear_has_kind(); + } +} +inline const std::string& Value::string_value() const { + // @@protoc_insertion_point(field_get:google.protobuf.Value.string_value) + return _internal_string_value(); +} +inline void Value::set_string_value(const std::string& value) { + _internal_set_string_value(value); + // @@protoc_insertion_point(field_set:google.protobuf.Value.string_value) +} +inline std::string* Value::mutable_string_value() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Value.string_value) + return _internal_mutable_string_value(); +} +inline const std::string& Value::_internal_string_value() const { + if (_internal_has_string_value()) { + return kind_.string_value_.Get(); + } + return *&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(); +} +inline void Value::_internal_set_string_value(const std::string& value) { + if (!_internal_has_string_value()) { + clear_kind(); + set_has_string_value(); + kind_.string_value_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); + } + kind_.string_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena()); +} +inline void Value::set_string_value(std::string&& value) { + // @@protoc_insertion_point(field_set:google.protobuf.Value.string_value) + if (!_internal_has_string_value()) { + clear_kind(); + set_has_string_value(); + kind_.string_value_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); + } + kind_.string_value_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena()); + // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Value.string_value) +} +inline void Value::set_string_value(const char* value) { + GOOGLE_DCHECK(value != nullptr); + if (!_internal_has_string_value()) { + clear_kind(); + set_has_string_value(); + kind_.string_value_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); + } + kind_.string_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), + ::std::string(value), GetArena()); + // @@protoc_insertion_point(field_set_char:google.protobuf.Value.string_value) +} +inline void Value::set_string_value(const char* value, + size_t size) { + if (!_internal_has_string_value()) { + clear_kind(); + set_has_string_value(); + kind_.string_value_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); + } + kind_.string_value_.Set( + &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string( + reinterpret_cast(value), size), + GetArena()); + // @@protoc_insertion_point(field_set_pointer:google.protobuf.Value.string_value) +} +inline std::string* Value::_internal_mutable_string_value() { + if (!_internal_has_string_value()) { + clear_kind(); + set_has_string_value(); + kind_.string_value_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited()); + } + return kind_.string_value_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); +} +inline std::string* Value::release_string_value() { + // @@protoc_insertion_point(field_release:google.protobuf.Value.string_value) + if (_internal_has_string_value()) { + clear_has_kind(); + return kind_.string_value_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena()); + } else { + return nullptr; + } +} +inline void Value::set_allocated_string_value(std::string* string_value) { + if (has_kind()) { + clear_kind(); + } + if (string_value != nullptr) { + set_has_string_value(); + kind_.string_value_.UnsafeSetDefault(string_value); + ::PROTOBUF_NAMESPACE_ID::Arena* arena = GetArena(); + if (arena != nullptr) { + arena->Own(string_value); + } + } + // @@protoc_insertion_point(field_set_allocated:google.protobuf.Value.string_value) +} + +// bool bool_value = 4; +inline bool Value::_internal_has_bool_value() const { + return kind_case() == kBoolValue; +} +inline void Value::set_has_bool_value() { + _oneof_case_[0] = kBoolValue; +} +inline void Value::clear_bool_value() { + if (_internal_has_bool_value()) { + kind_.bool_value_ = false; + clear_has_kind(); + } +} +inline bool Value::_internal_bool_value() const { + if (_internal_has_bool_value()) { + return kind_.bool_value_; + } + return false; +} +inline void Value::_internal_set_bool_value(bool value) { + if (!_internal_has_bool_value()) { + clear_kind(); + set_has_bool_value(); + } + kind_.bool_value_ = value; +} +inline bool Value::bool_value() const { + // @@protoc_insertion_point(field_get:google.protobuf.Value.bool_value) + return _internal_bool_value(); +} +inline void Value::set_bool_value(bool value) { + _internal_set_bool_value(value); + // @@protoc_insertion_point(field_set:google.protobuf.Value.bool_value) +} + +// .google.protobuf.Struct struct_value = 5; +inline bool Value::_internal_has_struct_value() const { + return kind_case() == kStructValue; +} +inline bool Value::has_struct_value() const { + return _internal_has_struct_value(); +} +inline void Value::set_has_struct_value() { + _oneof_case_[0] = kStructValue; +} +inline void Value::clear_struct_value() { + if (_internal_has_struct_value()) { + if (GetArena() == nullptr) { + delete kind_.struct_value_; + } + clear_has_kind(); + } +} +inline PROTOBUF_NAMESPACE_ID::Struct* Value::release_struct_value() { + // @@protoc_insertion_point(field_release:google.protobuf.Value.struct_value) + if (_internal_has_struct_value()) { + clear_has_kind(); + PROTOBUF_NAMESPACE_ID::Struct* temp = kind_.struct_value_; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + kind_.struct_value_ = nullptr; + return temp; + } else { + return nullptr; + } +} +inline const PROTOBUF_NAMESPACE_ID::Struct& Value::_internal_struct_value() const { + return _internal_has_struct_value() + ? *kind_.struct_value_ + : *reinterpret_cast< PROTOBUF_NAMESPACE_ID::Struct*>(&PROTOBUF_NAMESPACE_ID::_Struct_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::Struct& Value::struct_value() const { + // @@protoc_insertion_point(field_get:google.protobuf.Value.struct_value) + return _internal_struct_value(); +} +inline PROTOBUF_NAMESPACE_ID::Struct* Value::unsafe_arena_release_struct_value() { + // @@protoc_insertion_point(field_unsafe_arena_release:google.protobuf.Value.struct_value) + if (_internal_has_struct_value()) { + clear_has_kind(); + PROTOBUF_NAMESPACE_ID::Struct* temp = kind_.struct_value_; + kind_.struct_value_ = nullptr; + return temp; + } else { + return nullptr; + } +} +inline void Value::unsafe_arena_set_allocated_struct_value(PROTOBUF_NAMESPACE_ID::Struct* struct_value) { + clear_kind(); + if (struct_value) { + set_has_struct_value(); + kind_.struct_value_ = struct_value; + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.Value.struct_value) +} +inline PROTOBUF_NAMESPACE_ID::Struct* Value::_internal_mutable_struct_value() { + if (!_internal_has_struct_value()) { + clear_kind(); + set_has_struct_value(); + kind_.struct_value_ = CreateMaybeMessage< PROTOBUF_NAMESPACE_ID::Struct >(GetArena()); + } + return kind_.struct_value_; +} +inline PROTOBUF_NAMESPACE_ID::Struct* Value::mutable_struct_value() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Value.struct_value) + return _internal_mutable_struct_value(); +} + +// .google.protobuf.ListValue list_value = 6; +inline bool Value::_internal_has_list_value() const { + return kind_case() == kListValue; +} +inline bool Value::has_list_value() const { + return _internal_has_list_value(); +} +inline void Value::set_has_list_value() { + _oneof_case_[0] = kListValue; +} +inline void Value::clear_list_value() { + if (_internal_has_list_value()) { + if (GetArena() == nullptr) { + delete kind_.list_value_; + } + clear_has_kind(); + } +} +inline PROTOBUF_NAMESPACE_ID::ListValue* Value::release_list_value() { + // @@protoc_insertion_point(field_release:google.protobuf.Value.list_value) + if (_internal_has_list_value()) { + clear_has_kind(); + PROTOBUF_NAMESPACE_ID::ListValue* temp = kind_.list_value_; + if (GetArena() != nullptr) { + temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp); + } + kind_.list_value_ = nullptr; + return temp; + } else { + return nullptr; + } +} +inline const PROTOBUF_NAMESPACE_ID::ListValue& Value::_internal_list_value() const { + return _internal_has_list_value() + ? *kind_.list_value_ + : *reinterpret_cast< PROTOBUF_NAMESPACE_ID::ListValue*>(&PROTOBUF_NAMESPACE_ID::_ListValue_default_instance_); +} +inline const PROTOBUF_NAMESPACE_ID::ListValue& Value::list_value() const { + // @@protoc_insertion_point(field_get:google.protobuf.Value.list_value) + return _internal_list_value(); +} +inline PROTOBUF_NAMESPACE_ID::ListValue* Value::unsafe_arena_release_list_value() { + // @@protoc_insertion_point(field_unsafe_arena_release:google.protobuf.Value.list_value) + if (_internal_has_list_value()) { + clear_has_kind(); + PROTOBUF_NAMESPACE_ID::ListValue* temp = kind_.list_value_; + kind_.list_value_ = nullptr; + return temp; + } else { + return nullptr; + } +} +inline void Value::unsafe_arena_set_allocated_list_value(PROTOBUF_NAMESPACE_ID::ListValue* list_value) { + clear_kind(); + if (list_value) { + set_has_list_value(); + kind_.list_value_ = list_value; + } + // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.Value.list_value) +} +inline PROTOBUF_NAMESPACE_ID::ListValue* Value::_internal_mutable_list_value() { + if (!_internal_has_list_value()) { + clear_kind(); + set_has_list_value(); + kind_.list_value_ = CreateMaybeMessage< PROTOBUF_NAMESPACE_ID::ListValue >(GetArena()); + } + return kind_.list_value_; +} +inline PROTOBUF_NAMESPACE_ID::ListValue* Value::mutable_list_value() { + // @@protoc_insertion_point(field_mutable:google.protobuf.Value.list_value) + return _internal_mutable_list_value(); +} + +inline bool Value::has_kind() const { + return kind_case() != KIND_NOT_SET; +} +inline void Value::clear_has_kind() { + _oneof_case_[0] = KIND_NOT_SET; +} +inline Value::KindCase Value::kind_case() const { + return Value::KindCase(_oneof_case_[0]); +} +// ------------------------------------------------------------------- + +// ListValue + +// repeated .google.protobuf.Value values = 1; +inline int ListValue::_internal_values_size() const { + return values_.size(); +} +inline int ListValue::values_size() const { + return _internal_values_size(); +} +inline void ListValue::clear_values() { + values_.Clear(); +} +inline PROTOBUF_NAMESPACE_ID::Value* ListValue::mutable_values(int index) { + // @@protoc_insertion_point(field_mutable:google.protobuf.ListValue.values) + return values_.Mutable(index); +} +inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Value >* +ListValue::mutable_values() { + // @@protoc_insertion_point(field_mutable_list:google.protobuf.ListValue.values) + return &values_; +} +inline const PROTOBUF_NAMESPACE_ID::Value& ListValue::_internal_values(int index) const { + return values_.Get(index); +} +inline const PROTOBUF_NAMESPACE_ID::Value& ListValue::values(int index) const { + // @@protoc_insertion_point(field_get:google.protobuf.ListValue.values) + return _internal_values(index); +} +inline PROTOBUF_NAMESPACE_ID::Value* ListValue::_internal_add_values() { + return values_.Add(); +} +inline PROTOBUF_NAMESPACE_ID::Value* ListValue::add_values() { + // @@protoc_insertion_point(field_add:google.protobuf.ListValue.values) + return _internal_add_values(); +} +inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Value >& +ListValue::values() const { + // @@protoc_insertion_point(field_list:google.protobuf.ListValue.values) + return values_; +} + +#ifdef __GNUC__ + #pragma GCC diagnostic pop +#endif // __GNUC__ +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + +// ------------------------------------------------------------------- + + +// @@protoc_insertion_point(namespace_scope) + +PROTOBUF_NAMESPACE_CLOSE + +PROTOBUF_NAMESPACE_OPEN + +template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::NullValue> : ::std::true_type {}; +template <> +inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::NullValue>() { + return PROTOBUF_NAMESPACE_ID::NullValue_descriptor(); +} + +PROTOBUF_NAMESPACE_CLOSE + +// @@protoc_insertion_point(global_scope) + +#include +#endif // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fstruct_2eproto + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/timestamp.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/timestamp.pb.h new file mode 100644 index 0000000000000000000000000000000000000000..95f333e728d8c81d5315bb1ad4143b024ecb9c4c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/timestamp.pb.h @@ -0,0 +1,282 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: google/protobuf/timestamp.proto + +#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2ftimestamp_2eproto +#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2ftimestamp_2eproto + +#include +#include + +#include +#if PROTOBUF_VERSION < 3013000 +#error This file was generated by a newer version of protoc which is +#error incompatible with your Protocol Buffer headers. Please update +#error your headers. +#endif +#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION +#error This file was generated by an older version of protoc which is +#error incompatible with your Protocol Buffer headers. Please +#error regenerate this file with a newer version of protoc. +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: export +#include // IWYU pragma: export +#include +// @@protoc_insertion_point(includes) +#include +#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2ftimestamp_2eproto PROTOBUF_EXPORT +PROTOBUF_NAMESPACE_OPEN +namespace internal { +class AnyMetadata; +} // namespace internal +PROTOBUF_NAMESPACE_CLOSE + +// Internal implementation detail -- do not use these members. +struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2ftimestamp_2eproto { + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1] + PROTOBUF_SECTION_VARIABLE(protodesc_cold); + static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[]; + static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[]; + static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[]; +}; +extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2ftimestamp_2eproto; +PROTOBUF_NAMESPACE_OPEN +class Timestamp; +class TimestampDefaultTypeInternal; +PROTOBUF_EXPORT extern TimestampDefaultTypeInternal _Timestamp_default_instance_; +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN +template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Timestamp* Arena::CreateMaybeMessage(Arena*); +PROTOBUF_NAMESPACE_CLOSE +PROTOBUF_NAMESPACE_OPEN + +// =================================================================== + +class PROTOBUF_EXPORT Timestamp PROTOBUF_FINAL : + public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Timestamp) */ { + public: + inline Timestamp() : Timestamp(nullptr) {} + virtual ~Timestamp(); + + Timestamp(const Timestamp& from); + Timestamp(Timestamp&& from) noexcept + : Timestamp() { + *this = ::std::move(from); + } + + inline Timestamp& operator=(const Timestamp& from) { + CopyFrom(from); + return *this; + } + inline Timestamp& operator=(Timestamp&& from) noexcept { + if (GetArena() == from.GetArena()) { + if (this != &from) InternalSwap(&from); + } else { + CopyFrom(from); + } + return *this; + } + + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() { + return GetDescriptor(); + } + static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() { + return GetMetadataStatic().descriptor; + } + static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() { + return GetMetadataStatic().reflection; + } + static const Timestamp& default_instance(); + + static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY + static inline const Timestamp* internal_default_instance() { + return reinterpret_cast( + &_Timestamp_default_instance_); + } + static constexpr int kIndexInFileMessages = + 0; + + friend void swap(Timestamp& a, Timestamp& b) { + a.Swap(&b); + } + inline void Swap(Timestamp* other) { + if (other == this) return; + if (GetArena() == other->GetArena()) { + InternalSwap(other); + } else { + ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other); + } + } + void UnsafeArenaSwap(Timestamp* other) { + if (other == this) return; + GOOGLE_DCHECK(GetArena() == other->GetArena()); + InternalSwap(other); + } + + // implements Message ---------------------------------------------- + + inline Timestamp* New() const final { + return CreateMaybeMessage(nullptr); + } + + Timestamp* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final { + return CreateMaybeMessage(arena); + } + void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final; + void CopyFrom(const Timestamp& from); + void MergeFrom(const Timestamp& from); + PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final; + bool IsInitialized() const final; + + size_t ByteSizeLong() const final; + const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final; + ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize( + ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final; + int GetCachedSize() const final { return _cached_size_.Get(); } + + private: + inline void SharedCtor(); + inline void SharedDtor(); + void SetCachedSize(int size) const final; + void InternalSwap(Timestamp* other); + friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata; + static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() { + return "google.protobuf.Timestamp"; + } + protected: + explicit Timestamp(::PROTOBUF_NAMESPACE_ID::Arena* arena); + private: + static void ArenaDtor(void* object); + inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena); + public: + + ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final; + private: + static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() { + ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2ftimestamp_2eproto); + return ::descriptor_table_google_2fprotobuf_2ftimestamp_2eproto.file_level_metadata[kIndexInFileMessages]; + } + + public: + + // nested types ---------------------------------------------------- + + // accessors ------------------------------------------------------- + + enum : int { + kSecondsFieldNumber = 1, + kNanosFieldNumber = 2, + }; + // int64 seconds = 1; + void clear_seconds(); + ::PROTOBUF_NAMESPACE_ID::int64 seconds() const; + void set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value); + private: + ::PROTOBUF_NAMESPACE_ID::int64 _internal_seconds() const; + void _internal_set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value); + public: + + // int32 nanos = 2; + void clear_nanos(); + ::PROTOBUF_NAMESPACE_ID::int32 nanos() const; + void set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value); + private: + ::PROTOBUF_NAMESPACE_ID::int32 _internal_nanos() const; + void _internal_set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value); + public: + + // @@protoc_insertion_point(class_scope:google.protobuf.Timestamp) + private: + class _Internal; + + template friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + ::PROTOBUF_NAMESPACE_ID::int64 seconds_; + ::PROTOBUF_NAMESPACE_ID::int32 nanos_; + mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_; + friend struct ::TableStruct_google_2fprotobuf_2ftimestamp_2eproto; +}; +// =================================================================== + + +// =================================================================== + +#ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // __GNUC__ +// Timestamp + +// int64 seconds = 1; +inline void Timestamp::clear_seconds() { + seconds_ = PROTOBUF_LONGLONG(0); +} +inline ::PROTOBUF_NAMESPACE_ID::int64 Timestamp::_internal_seconds() const { + return seconds_; +} +inline ::PROTOBUF_NAMESPACE_ID::int64 Timestamp::seconds() const { + // @@protoc_insertion_point(field_get:google.protobuf.Timestamp.seconds) + return _internal_seconds(); +} +inline void Timestamp::_internal_set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value) { + + seconds_ = value; +} +inline void Timestamp::set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value) { + _internal_set_seconds(value); + // @@protoc_insertion_point(field_set:google.protobuf.Timestamp.seconds) +} + +// int32 nanos = 2; +inline void Timestamp::clear_nanos() { + nanos_ = 0; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 Timestamp::_internal_nanos() const { + return nanos_; +} +inline ::PROTOBUF_NAMESPACE_ID::int32 Timestamp::nanos() const { + // @@protoc_insertion_point(field_get:google.protobuf.Timestamp.nanos) + return _internal_nanos(); +} +inline void Timestamp::_internal_set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value) { + + nanos_ = value; +} +inline void Timestamp::set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value) { + _internal_set_nanos(value); + // @@protoc_insertion_point(field_set:google.protobuf.Timestamp.nanos) +} + +#ifdef __GNUC__ + #pragma GCC diagnostic pop +#endif // __GNUC__ + +// @@protoc_insertion_point(namespace_scope) + +PROTOBUF_NAMESPACE_CLOSE + +// @@protoc_insertion_point(global_scope) + +#include +#endif // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2ftimestamp_2eproto + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/unknown_field_set.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/unknown_field_set.h new file mode 100644 index 0000000000000000000000000000000000000000..fe676b9f8dd787f2aa933bd991b29cce97e58577 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/unknown_field_set.h @@ -0,0 +1,416 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Author: kenton@google.com (Kenton Varda) +// Based on original Protocol Buffers design by +// Sanjay Ghemawat, Jeff Dean, and others. +// +// Contains classes used to keep track of unrecognized fields seen while +// parsing a protocol message. + +#ifndef GOOGLE_PROTOBUF_UNKNOWN_FIELD_SET_H__ +#define GOOGLE_PROTOBUF_UNKNOWN_FIELD_SET_H__ + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +namespace google { +namespace protobuf { +namespace internal { +class InternalMetadata; // metadata_lite.h +class WireFormat; // wire_format.h +class MessageSetFieldSkipperUsingCord; +// extension_set_heavy.cc +} // namespace internal + +class Message; // message.h +class UnknownField; // below + +// An UnknownFieldSet contains fields that were encountered while parsing a +// message but were not defined by its type. Keeping track of these can be +// useful, especially in that they may be written if the message is serialized +// again without being cleared in between. This means that software which +// simply receives messages and forwards them to other servers does not need +// to be updated every time a new field is added to the message definition. +// +// To get the UnknownFieldSet attached to any message, call +// Reflection::GetUnknownFields(). +// +// This class is necessarily tied to the protocol buffer wire format, unlike +// the Reflection interface which is independent of any serialization scheme. +class PROTOBUF_EXPORT UnknownFieldSet { + public: + UnknownFieldSet(); + ~UnknownFieldSet(); + + // Remove all fields. + inline void Clear(); + + // Remove all fields and deallocate internal data objects + void ClearAndFreeMemory(); + + // Is this set empty? + inline bool empty() const; + + // Merge the contents of some other UnknownFieldSet with this one. + void MergeFrom(const UnknownFieldSet& other); + + // Similar to above, but this function will destroy the contents of other. + void MergeFromAndDestroy(UnknownFieldSet* other); + + // Merge the contents an UnknownFieldSet with the UnknownFieldSet in + // *metadata, if there is one. If *metadata doesn't have an UnknownFieldSet + // then add one to it and make it be a copy of the first arg. + static void MergeToInternalMetadata(const UnknownFieldSet& other, + internal::InternalMetadata* metadata); + + // Swaps the contents of some other UnknownFieldSet with this one. + inline void Swap(UnknownFieldSet* x); + + // Computes (an estimate of) the total number of bytes currently used for + // storing the unknown fields in memory. Does NOT include + // sizeof(*this) in the calculation. + size_t SpaceUsedExcludingSelfLong() const; + + int SpaceUsedExcludingSelf() const { + return internal::ToIntSize(SpaceUsedExcludingSelfLong()); + } + + // Version of SpaceUsed() including sizeof(*this). + size_t SpaceUsedLong() const; + + int SpaceUsed() const { return internal::ToIntSize(SpaceUsedLong()); } + + // Returns the number of fields present in the UnknownFieldSet. + inline int field_count() const; + // Get a field in the set, where 0 <= index < field_count(). The fields + // appear in the order in which they were added. + inline const UnknownField& field(int index) const; + // Get a mutable pointer to a field in the set, where + // 0 <= index < field_count(). The fields appear in the order in which + // they were added. + inline UnknownField* mutable_field(int index); + + // Adding fields --------------------------------------------------- + + void AddVarint(int number, uint64 value); + void AddFixed32(int number, uint32 value); + void AddFixed64(int number, uint64 value); + void AddLengthDelimited(int number, const std::string& value); + std::string* AddLengthDelimited(int number); + UnknownFieldSet* AddGroup(int number); + + // Adds an unknown field from another set. + void AddField(const UnknownField& field); + + // Delete fields with indices in the range [start .. start+num-1]. + // Caution: implementation moves all fields with indices [start+num .. ]. + void DeleteSubrange(int start, int num); + + // Delete all fields with a specific field number. The order of left fields + // is preserved. + // Caution: implementation moves all fields after the first deleted field. + void DeleteByNumber(int number); + + // Parsing helpers ------------------------------------------------- + // These work exactly like the similarly-named methods of Message. + + bool MergeFromCodedStream(io::CodedInputStream* input); + bool ParseFromCodedStream(io::CodedInputStream* input); + bool ParseFromZeroCopyStream(io::ZeroCopyInputStream* input); + bool ParseFromArray(const void* data, int size); + inline bool ParseFromString(const std::string& data) { + return ParseFromArray(data.data(), static_cast(data.size())); + } + + // Merges this message's unknown field data (if any). This works whether + // the message is a lite or full proto (for legacy reasons, lite and full + // return different types for MessageType::unknown_fields()). + template + bool MergeFromMessage(const MessageType& message); + + static const UnknownFieldSet& default_instance(); + + private: + // For InternalMergeFrom + friend class UnknownField; + // Merges from other UnknownFieldSet. This method assumes, that this object + // is newly created and has no fields. + void InternalMergeFrom(const UnknownFieldSet& other); + void ClearFallback(); + + template ::value, int>::type = 0> + bool InternalMergeFromMessage(const MessageType& message) { + MergeFrom(message.GetReflection()->GetUnknownFields(message)); + return true; + } + + template ::value && + !std::is_base_of::value, + int>::type = 0> + bool InternalMergeFromMessage(const MessageType& message) { + const auto& unknown_fields = message.unknown_fields(); + io::ArrayInputStream array_stream(unknown_fields.data(), + unknown_fields.size()); + io::CodedInputStream coded_stream(&array_stream); + return MergeFromCodedStream(&coded_stream); + } + + std::vector fields_; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(UnknownFieldSet); +}; + +namespace internal { + +inline void WriteVarint(uint32 num, uint64 val, UnknownFieldSet* unknown) { + unknown->AddVarint(num, val); +} +inline void WriteLengthDelimited(uint32 num, StringPiece val, + UnknownFieldSet* unknown) { + unknown->AddLengthDelimited(num)->assign(val.data(), val.size()); +} + +PROTOBUF_EXPORT +const char* UnknownGroupParse(UnknownFieldSet* unknown, const char* ptr, + ParseContext* ctx); +PROTOBUF_EXPORT +const char* UnknownFieldParse(uint64 tag, UnknownFieldSet* unknown, + const char* ptr, ParseContext* ctx); + +} // namespace internal + +// Represents one field in an UnknownFieldSet. +class PROTOBUF_EXPORT UnknownField { + public: + enum Type { + TYPE_VARINT, + TYPE_FIXED32, + TYPE_FIXED64, + TYPE_LENGTH_DELIMITED, + TYPE_GROUP + }; + + // The field's field number, as seen on the wire. + inline int number() const; + + // The field type. + inline Type type() const; + + // Accessors ------------------------------------------------------- + // Each method works only for UnknownFields of the corresponding type. + + inline uint64 varint() const; + inline uint32 fixed32() const; + inline uint64 fixed64() const; + inline const std::string& length_delimited() const; + inline const UnknownFieldSet& group() const; + + inline void set_varint(uint64 value); + inline void set_fixed32(uint32 value); + inline void set_fixed64(uint64 value); + inline void set_length_delimited(const std::string& value); + inline std::string* mutable_length_delimited(); + inline UnknownFieldSet* mutable_group(); + + // Serialization API. + // These methods can take advantage of the underlying implementation and may + // archieve a better performance than using getters to retrieve the data and + // do the serialization yourself. + void SerializeLengthDelimitedNoTag(io::CodedOutputStream* output) const { + output->SetCur(InternalSerializeLengthDelimitedNoTag(output->Cur(), + output->EpsCopy())); + } + + inline size_t GetLengthDelimitedSize() const; + uint8* InternalSerializeLengthDelimitedNoTag( + uint8* target, io::EpsCopyOutputStream* stream) const; + + + // If this UnknownField contains a pointer, delete it. + void Delete(); + + // Make a deep copy of any pointers in this UnknownField. + void DeepCopy(const UnknownField& other); + + // Set the wire type of this UnknownField. Should only be used when this + // UnknownField is being created. + inline void SetType(Type type); + + union LengthDelimited { + std::string* string_value; + }; + + uint32 number_; + uint32 type_; + union { + uint64 varint_; + uint32 fixed32_; + uint64 fixed64_; + mutable union LengthDelimited length_delimited_; + UnknownFieldSet* group_; + } data_; +}; + +// =================================================================== +// inline implementations + +inline UnknownFieldSet::UnknownFieldSet() {} + +inline UnknownFieldSet::~UnknownFieldSet() { Clear(); } + +inline void UnknownFieldSet::ClearAndFreeMemory() { Clear(); } + +inline void UnknownFieldSet::Clear() { + if (!fields_.empty()) { + ClearFallback(); + } +} + +inline bool UnknownFieldSet::empty() const { return fields_.empty(); } + +inline void UnknownFieldSet::Swap(UnknownFieldSet* x) { + fields_.swap(x->fields_); +} + +inline int UnknownFieldSet::field_count() const { + return static_cast(fields_.size()); +} +inline const UnknownField& UnknownFieldSet::field(int index) const { + return (fields_)[static_cast(index)]; +} +inline UnknownField* UnknownFieldSet::mutable_field(int index) { + return &(fields_)[static_cast(index)]; +} + +inline void UnknownFieldSet::AddLengthDelimited(int number, + const std::string& value) { + AddLengthDelimited(number)->assign(value); +} + + + + +inline int UnknownField::number() const { return static_cast(number_); } +inline UnknownField::Type UnknownField::type() const { + return static_cast(type_); +} + +inline uint64 UnknownField::varint() const { + assert(type() == TYPE_VARINT); + return data_.varint_; +} +inline uint32 UnknownField::fixed32() const { + assert(type() == TYPE_FIXED32); + return data_.fixed32_; +} +inline uint64 UnknownField::fixed64() const { + assert(type() == TYPE_FIXED64); + return data_.fixed64_; +} +inline const std::string& UnknownField::length_delimited() const { + assert(type() == TYPE_LENGTH_DELIMITED); + return *data_.length_delimited_.string_value; +} +inline const UnknownFieldSet& UnknownField::group() const { + assert(type() == TYPE_GROUP); + return *data_.group_; +} + +inline void UnknownField::set_varint(uint64 value) { + assert(type() == TYPE_VARINT); + data_.varint_ = value; +} +inline void UnknownField::set_fixed32(uint32 value) { + assert(type() == TYPE_FIXED32); + data_.fixed32_ = value; +} +inline void UnknownField::set_fixed64(uint64 value) { + assert(type() == TYPE_FIXED64); + data_.fixed64_ = value; +} +inline void UnknownField::set_length_delimited(const std::string& value) { + assert(type() == TYPE_LENGTH_DELIMITED); + data_.length_delimited_.string_value->assign(value); +} +inline std::string* UnknownField::mutable_length_delimited() { + assert(type() == TYPE_LENGTH_DELIMITED); + return data_.length_delimited_.string_value; +} +inline UnknownFieldSet* UnknownField::mutable_group() { + assert(type() == TYPE_GROUP); + return data_.group_; +} +template +bool UnknownFieldSet::MergeFromMessage(const MessageType& message) { + // SFINAE will route to the right version. + return InternalMergeFromMessage(message); +} + + +inline size_t UnknownField::GetLengthDelimitedSize() const { + GOOGLE_DCHECK_EQ(TYPE_LENGTH_DELIMITED, type()); + return data_.length_delimited_.string_value->size(); +} + +inline void UnknownField::SetType(Type type) { + type_ = type; +} + + +} // namespace protobuf +} // namespace google + +#include +#endif // GOOGLE_PROTOBUF_UNKNOWN_FIELD_SET_H__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5104066e5acbb8a8792cfa92f219f03f8648724 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/configuration_albert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/configuration_albert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cfdf49b2893e65654786213423f9e81001342e97 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/configuration_albert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_flax_albert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_flax_albert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd34030a87233f03fc18610dcc5c7afafff72f70 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_flax_albert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/tokenization_albert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/tokenization_albert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..946ff44e42202ded5f49ba32270f31ec59d82de9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/tokenization_albert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb22bd35378a4effd105932791e050ff09c846ca Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/configuration_align.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/configuration_align.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c259fb33d77887788cd82cb9f2c4bac766fc1ab7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/configuration_align.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/modeling_align.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/modeling_align.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5183d063f6581189577fea33420073c554b51da Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/modeling_align.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/processing_align.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/processing_align.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3beb50940016d119f1606898e3c0efce8681b68b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/processing_align.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/altclip/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/altclip/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..124478739f61fe4b2e3620adc75f13c5afe36587 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/altclip/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/altclip/__pycache__/processing_altclip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/altclip/__pycache__/processing_altclip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88e15cf50dc6dc8ae4b1b19ed6623899be34dd73 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/altclip/__pycache__/processing_altclip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc90b3c9f905ef221430dc646bbbbff46b474262 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/configuration_apertus.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/configuration_apertus.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee06407c1dd6128cbf2528c51e04e534b9d617eb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/configuration_apertus.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/modeling_apertus.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/modeling_apertus.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad0085cef93a3b64a47ee114d76317edf6d4415c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/modeling_apertus.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/modular_apertus.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/modular_apertus.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..deacc9538cf55b1a94fa67741b79f5904bdc6833 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/modular_apertus.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79cc66dbd420a23db279478f0aeae4f230bf82cf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/configuration_arcee.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/configuration_arcee.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9332d2a44a6b78fcf989a02a5f4faf9f09e2b9b4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/configuration_arcee.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/modeling_arcee.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/modeling_arcee.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a142383651b5f5b95746bea9f82fa7c42cf7652 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/modeling_arcee.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/modular_arcee.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/modular_arcee.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba720015a1c761daa1c18a9bdcb76784ce7e720e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/modular_arcee.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..709d0be6c74a10a1eaef56b1168dabb3008a8ff3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/configuration_aria.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/configuration_aria.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..100cc5c6c41009fcb0e4ba9a4f526761d912c25d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/configuration_aria.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/image_processing_aria.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/image_processing_aria.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bcb6197dad26e6f4dee10188ff7e0be3df19448b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/image_processing_aria.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/modeling_aria.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/modeling_aria.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1713498288eb168034b6274fc54e0dd5a6e86524 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/modeling_aria.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/modular_aria.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/modular_aria.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c75c6152f0e3c3b71cd9f20183469c23324fc9a4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/modular_aria.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/processing_aria.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/processing_aria.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..318812012bc51709e983a3e423fc939cdc9a2ba9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/processing_aria.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/autoformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/autoformer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e15e6ac1e94e7b894e096e197c75bf05f00907ea Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/autoformer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/autoformer/__pycache__/configuration_autoformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/autoformer/__pycache__/configuration_autoformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88cff3d1864f3729489bb16a58ed98612d55f4c8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/autoformer/__pycache__/configuration_autoformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c047eb12c6d24e3d1af407ad450eaa425c40036 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/configuration_bark.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/configuration_bark.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1d9d1d6d6958fb3cbddcb83d503248b5911be77 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/configuration_bark.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/generation_configuration_bark.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/generation_configuration_bark.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa727af000416cf656d074157929400db7f0c7b4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/generation_configuration_bark.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/modeling_bark.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/modeling_bark.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79f0dc17e1c0d59afa3867b2df0cece63b20b62b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/modeling_bark.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/processing_bark.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/processing_bark.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56f3dd054e729a8675e9ae8171467cc33639b7cb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/processing_bark.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b6759566c9a61c2064ff95bb340c7011fe5e6ae Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/configuration_beit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/configuration_beit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ea247c01c3ed71972b5a0137cd1cce9460dbd02 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/configuration_beit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/feature_extraction_beit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/feature_extraction_beit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a6325e616e28ea07dc98176d42a2c8521f6d0b8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/feature_extraction_beit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/image_processing_beit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/image_processing_beit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a246ea0e0e768ab28fd145bc789525d7d91f63d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/image_processing_beit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/image_processing_beit_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/image_processing_beit_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..01b4c1cf2d9098c1304c5616d6eff03eed6d6426 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/image_processing_beit_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/modeling_beit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/modeling_beit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6350c942559de86d482b1656099812582274a6b6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/modeling_beit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/modeling_flax_beit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/modeling_flax_beit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64b344a532adbd86af8624d3e0cb414f27adf1f2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/modeling_flax_beit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0921587deb0f9f3790adeb655aea06da24443bd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/configuration_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/configuration_bert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e7e8e6555522a39d6d9e52fbd03c9535c0ed504 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/configuration_bert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/modeling_flax_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/modeling_flax_bert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ba6df18a425eaba9eec9377855ce9ab0d71d37e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/modeling_flax_bert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/modeling_tf_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/modeling_tf_bert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03fcc508e4ca96a9f4fcd576023b298d24087be4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/modeling_tf_bert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6957ad4b6fea47fb10f2947c7990cd5b462527f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert_tf.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert_tf.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..696c020e07597039947856eeffa81dccc0568a37 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert_tf.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8d7e6b9e48f4cf94e72aecce2d5729a965aee17 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/configuration_bert_generation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/configuration_bert_generation.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bcff567016ed93845a983d99c48bb638d6c2bae6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/configuration_bert_generation.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/modeling_bert_generation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/modeling_bert_generation.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8989002978aa3a3fdc09ad32bbcd165b26361708 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/modeling_bert_generation.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/tokenization_bert_generation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/tokenization_bert_generation.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5e323823e6653e8404d9b47899715bb237bdc5a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/tokenization_bert_generation.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..87419e69e5c7f0572076832237e10f236186ec33 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_big_bird import * + from .modeling_big_bird import * + from .modeling_flax_big_bird import * + from .tokenization_big_bird import * + from .tokenization_big_bird_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/configuration_big_bird.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/configuration_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..8e29439bc4bd54f06689b7a3e93008baf93427ad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/configuration_big_bird.py @@ -0,0 +1,176 @@ +# coding=utf-8 +# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BigBird model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BigBirdConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BigBirdModel`]. It is used to instantiate an + BigBird model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the BigBird + [google/bigbird-roberta-base](https://huggingface.co/google/bigbird-roberta-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50358): + Vocabulary size of the BigBird model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BigBirdModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 1024 or 2048 or 4096). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BigBirdModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + attention_type (`str`, *optional*, defaults to `"block_sparse"`) + Whether to use block sparse attention (with n complexity) as introduced in paper or original attention + layer (with n^2 complexity). Possible values are `"original_full"` and `"block_sparse"`. + use_bias (`bool`, *optional*, defaults to `True`) + Whether to use bias in query, key, value. + rescale_embeddings (`bool`, *optional*, defaults to `False`) + Whether to rescale embeddings with (hidden_size ** 0.5). + block_size (`int`, *optional*, defaults to 64) + Size of each block. Useful only when `attention_type == "block_sparse"`. + num_random_blocks (`int`, *optional*, defaults to 3) + Each query is going to attend these many number of random blocks. Useful only when `attention_type == + "block_sparse"`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Example: + + ```python + >>> from transformers import BigBirdConfig, BigBirdModel + + >>> # Initializing a BigBird google/bigbird-roberta-base style configuration + >>> configuration = BigBirdConfig() + + >>> # Initializing a model (with random weights) from the google/bigbird-roberta-base style configuration + >>> model = BigBirdModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "big_bird" + + def __init__( + self, + vocab_size=50358, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu_new", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=4096, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + sep_token_id=66, + attention_type="block_sparse", + use_bias=True, + rescale_embeddings=False, + block_size=64, + num_random_blocks=3, + classifier_dropout=None, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + + self.rescale_embeddings = rescale_embeddings + self.attention_type = attention_type + self.use_bias = use_bias + self.block_size = block_size + self.num_random_blocks = num_random_blocks + self.classifier_dropout = classifier_dropout + + +class BigBirdOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) + + +__all__ = ["BigBirdConfig", "BigBirdOnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/modeling_big_bird.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/modeling_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..eb89d9872be8816c83158faa4ac81c2238c634d7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/modeling_big_bird.py @@ -0,0 +1,2957 @@ +# coding=utf-8 +# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BigBird model.""" + +import math +import os +from dataclasses import dataclass +from typing import Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward +from ...utils import ModelOutput, auto_docstring, logging +from ...utils.deprecation import deprecate_kwarg +from .configuration_big_bird import BigBirdConfig + + +logger = logging.get_logger(__name__) + + +_TRIVIA_QA_MAPPING = { + "big_bird_attention": "attention/self", + "output_layer_norm": "output/LayerNorm", + "attention_output": "attention/output/dense", + "output": "output/dense", + "self_attention_layer_norm": "attention/output/LayerNorm", + "intermediate": "intermediate/dense", + "word_embeddings": "bert/embeddings/word_embeddings", + "position_embedding": "bert/embeddings/position_embeddings", + "type_embeddings": "bert/embeddings/token_type_embeddings", + "embeddings": "bert/embeddings", + "layer_normalization": "output/LayerNorm", + "layer_norm": "LayerNorm", + "trivia_qa_head": "qa_classifier", + "dense": "intermediate/dense", + "dense_1": "qa_outputs", +} + + +def load_tf_weights_in_big_bird(model, tf_checkpoint_path, is_trivia_qa=False): + """Load tf checkpoints in a pytorch model.""" + + def load_tf_weights_bert(init_vars, tf_path): + names = [] + tf_weights = {} + + for name, shape in init_vars: + array = tf.train.load_variable(tf_path, name) + name = name.replace("bert/encoder/LayerNorm", "bert/embeddings/LayerNorm") + logger.info(f"Loading TF weight {name} with shape {shape}") + names.append(name) + tf_weights[name] = array + + return names, tf_weights + + def load_tf_weights_trivia_qa(init_vars): + names = [] + tf_weights = {} + + for i, var in enumerate(init_vars): + name_items = var.name.split("/") + + if "transformer_scaffold" in name_items[0]: + layer_name_items = name_items[0].split("_") + if len(layer_name_items) < 3: + layer_name_items += [0] + + name_items[0] = f"bert/encoder/layer_{layer_name_items[2]}" + + name = "/".join([_TRIVIA_QA_MAPPING.get(x, x) for x in name_items])[:-2] # remove last :0 in variable + + if "self/attention/output" in name: + name = name.replace("self/attention/output", "output") + + if i >= len(init_vars) - 2: + name = name.replace("intermediate", "output") + + logger.info(f"Loading TF weight {name} with shape {var.shape}") + array = var.value().numpy() + names.append(name) + tf_weights[name] = array + + return names, tf_weights + + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + + # Load weights from TF model + init_vars = tf.saved_model.load(tf_path).variables if is_trivia_qa else tf.train.list_variables(tf_path) + + if len(init_vars) <= 0: + raise ValueError("Loaded trained variables cannot be empty.") + + pt_names = list(model.state_dict().keys()) + + if is_trivia_qa: + names, tf_weights = load_tf_weights_trivia_qa(init_vars) + else: + names, tf_weights = load_tf_weights_bert(init_vars, tf_path) + + for txt_name in names: + array = tf_weights[txt_name] + name = txt_name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + pt_name = [] + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + pt_name.append("weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + pt_name.append("bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + pt_name.append("weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + pt_name.append("classifier") + elif scope_names[0] == "transform": + pointer = getattr(pointer, "transform") + pt_name.append("transform") + if ("bias" in name) or ("kernel" in name): + pointer = getattr(pointer, "dense") + pt_name.append("dense") + elif ("beta" in name) or ("gamma" in name): + pointer = getattr(pointer, "LayerNorm") + pt_name.append("LayerNorm") + else: + try: + pointer = getattr(pointer, scope_names[0]) + pt_name.append(f"{scope_names[0]}") + except AttributeError: + logger.info(f"Skipping {m_name}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + pt_name.append(f"{num}") + if m_name[-11:] == "_embeddings" or m_name == "embeddings": + pointer = getattr(pointer, "weight") + pt_name.append("weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if len(array.shape) > len(pointer.shape) and math.prod(array.shape) == math.prod(pointer.shape): + # print(txt_name, array.shape) + if ( + txt_name.endswith("attention/self/key/kernel") + or txt_name.endswith("attention/self/query/kernel") + or txt_name.endswith("attention/self/value/kernel") + ): + array = array.transpose(1, 0, 2).reshape(pointer.shape) + elif txt_name.endswith("attention/output/dense/kernel"): + array = array.transpose(0, 2, 1).reshape(pointer.shape) + else: + array = array.reshape(pointer.shape) + + if pointer.shape != array.shape: + raise ValueError( + f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched of {txt_name}." + ) + except ValueError as e: + e.args += (pointer.shape, array.shape) + raise + pt_weight_name = ".".join(pt_name) + logger.info(f"Initialize PyTorch weight {pt_weight_name} from {txt_name}.") + pointer.data = torch.from_numpy(array) + tf_weights.pop(txt_name, None) + pt_names.remove(pt_weight_name) + + logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.") + logger.info(f"Weights not initialized in PyTorch model: {', '.join(pt_names)}.") + return model + + +class BigBirdEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + # End copy + + self.rescale_embeddings = config.rescale_embeddings + self.hidden_size = config.hidden_size + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if self.rescale_embeddings: + inputs_embeds = inputs_embeds * (self.hidden_size**0.5) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + embeddings = self.dropout(embeddings) + embeddings = self.LayerNorm(embeddings) + return embeddings + + +class BigBirdSelfAttention(nn.Module): + def __init__(self, config, layer_idx=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.is_decoder = config.is_decoder + self.layer_idx = layer_idx + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + cache_position=None, + ): + batch_size, seq_length, _ = hidden_states.shape + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + is_cross_attention = encoder_hidden_states is not None + current_states = encoder_hidden_states if is_cross_attention else hidden_states + attention_mask = encoder_attention_mask if is_cross_attention else attention_mask + if is_cross_attention and past_key_values is not None and past_key_values.get_seq_length(self.layer_idx) > 0: + # reuse k,v, cross_attentions + key_layer = past_key_values.layers[self.layer_idx].keys + value_layer = past_key_values.layers[self.layer_idx].values + else: + key_layer = ( + self.key(current_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(current_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + if past_key_values is not None: + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + key_layer, value_layer = past_key_values.update( + key_layer, + value_layer, + self.layer_idx, + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BigBirdModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer, attention_probs + + +class BigBirdBlockSparseAttention(nn.Module): + def __init__(self, config, seed=None): + super().__init__() + + self.max_seqlen = config.max_position_embeddings + self.seed = seed + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.num_random_blocks = config.num_random_blocks + self.block_size = config.block_size + + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + + def forward( + self, + hidden_states, + band_mask=None, + from_mask=None, + to_mask=None, + from_blocked_mask=None, + to_blocked_mask=None, + output_attentions=None, + ): + # Currently this `class` can't be used in decoder. + + batch_size, seqlen, _ = hidden_states.size() + to_seq_length = from_seq_length = seqlen + from_block_size = to_block_size = self.block_size + + if from_seq_length % from_block_size != 0: + raise ValueError("Query sided sequence length must be multiple of block size") + + if to_seq_length % to_block_size != 0: + raise ValueError("Key/Value sided sequence length must be multiple of block size") + + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + key_layer = ( + self.key(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + context_layer, attention_probs = self.bigbird_block_sparse_attention( + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + self.num_attention_heads, + self.num_random_blocks, + self.attention_head_size, + from_block_size, + to_block_size, + batch_size, + from_seq_length, + to_seq_length, + seed=self.seed, + plan_from_length=None, + plan_num_rand_blocks=None, + output_attentions=output_attentions, + ) + + context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1) + return context_layer, attention_probs + + @staticmethod + def torch_bmm_nd(inp_1, inp_2, ndim=None): + """Fast nd matrix multiplication""" + # faster replacement of torch.einsum ("bhqk,bhkd->bhqd") + return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( + inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 1]) + ) + + @staticmethod + def torch_bmm_nd_transpose(inp_1, inp_2, ndim=None): + """Fast nd matrix multiplication with transpose""" + # faster replacement of torch.einsum (bhqd,bhkd->bhqk) + return torch.bmm( + inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) + ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) + + def bigbird_block_sparse_attention( + self, + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + n_heads, + n_rand_blocks, + attention_head_size, + from_block_size, + to_block_size, + batch_size, + from_seq_len, + to_seq_len, + seed, + plan_from_length, + plan_num_rand_blocks, + output_attentions, + ): + # BigBird block-sparse attention as suggested in paper + + # ITC: + # global tokens: 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # ETC: + # global tokens: extra_globals_tokens + 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # Note: + # 1) Currently, ETC is not supported. + # 2) Window size is fixed to 3 blocks & it can be changed only by + # changing `block_size`. + # 3) Number of global blocks are fixed (2 blocks here) & global tokens can be + # controlled only by `block_size`. + + # attention is calculated separately for q[0], q[1], q[2:-2], q[-2], q[-1] in order to use special trick of shifting tokens (for calculating sliding attention) + # hence following code can be divided into 5 parts. + + if from_seq_len // from_block_size != to_seq_len // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + rsqrt_d = 1 / math.sqrt(attention_head_size) + bsz = batch_size + attn_mask_penalty = -10000.0 + + # generate random attention and corresponding masks + np.random.seed(seed) + if from_seq_len in [1024, 3072, 4096]: # old plans used in paper + rand_attn = [ + self._bigbird_block_rand_mask( + self.max_seqlen, self.max_seqlen, from_block_size, to_block_size, n_rand_blocks, last_idx=1024 + )[: (from_seq_len // from_block_size - 2)] + for _ in range(n_heads) + ] + else: + if plan_from_length is None: + plan_from_length, plan_num_rand_blocks = self._get_rand_attn_plan( + from_seq_len, from_block_size, n_rand_blocks + ) + + rand_attn = self._bigbird_block_rand_mask_with_head( + from_seq_length=from_seq_len, + to_seq_length=to_seq_len, + from_block_size=from_block_size, + to_block_size=to_block_size, + num_heads=n_heads, + plan_from_length=plan_from_length, + plan_num_rand_blocks=plan_num_rand_blocks, + ) + + rand_attn = np.stack(rand_attn, axis=0) + rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long) + rand_attn.unsqueeze_(0) + rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0) + + rand_mask = self._create_rand_mask_from_inputs( + from_blocked_mask, to_blocked_mask, rand_attn, n_heads, n_rand_blocks, bsz, from_seq_len, from_block_size + ) + + blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1) + blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + + # preparing block for randn attn + gathered_key = self.torch_gather_b2(blocked_key_matrix, rand_attn) + gathered_key = gathered_key.view( + bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1 + ) # [bsz, n_heads, to_seq_len//to_block_size-2, n_rand_blocks, to_block_size, -1] + gathered_value = self.torch_gather_b2(blocked_value_matrix, rand_attn) + gathered_value = gathered_value.view( + bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1 + ) # [bsz, n_heads, to_seq_len//to_block_size-2, n_rand_blocks, to_block_size, -1] + + # 1st PART + # 1st block (global block) attention scores + # q[0] x (k[0], k[1], k[2], k[3], k[4] .... ) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4) + + first_product = first_product * rsqrt_d + first_product += (1.0 - to_mask) * attn_mask_penalty + first_attn_weights = nn.functional.softmax( + first_product, dim=-1 + ) # [bsz, n_heads, from_block_size, to_seq_len] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + first_context_layer = self.torch_bmm_nd(first_attn_weights, value_layer, ndim=4) + first_context_layer.unsqueeze_(2) + + # 2nd PART + # 2nd block attention scores + # q[1] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> 2nd, 3rd blocks + # global key blocks -> 1st block + + second_key_mat = torch.cat( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, 1], + blocked_key_matrix[:, :, 2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, 0], + ], + dim=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + second_value_mat = torch.cat( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, 1], + blocked_value_matrix[:, :, 2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, 0], + ], + dim=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4) + second_seq_pad = torch.cat( + [ + to_mask[:, :, :, : 3 * to_block_size], + to_mask[:, :, :, -to_block_size:], + to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), + ], + dim=3, + ) + second_rand_pad = torch.cat( + [ + rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), + rand_mask[:, :, 0], + ], + dim=3, + ) + second_product = second_product * rsqrt_d + second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty + second_attn_weights = nn.functional.softmax( + second_product, dim=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, -1] + second_context_layer = self.torch_bmm_nd(second_attn_weights, second_value_mat, ndim=4) + + second_context_layer.unsqueeze_(2) + + # 3rd PART + # Middle blocks attention scores + # q[-2:2] x (sliding_keys, random_keys, global_keys) + # sliding attn is calculated using special trick of shifting tokens as discussed in paper + # random keys are generated by taking random indices as per `rand_attn` + # global keys -> 1st & last block + + exp_blocked_key_matrix = torch.cat( + [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + exp_blocked_value_matrix = torch.cat( + [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]], + dim=3, + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + middle_query_matrix = blocked_query_matrix[:, :, 2:-2] + + # sliding attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [b, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + inner_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, exp_blocked_key_matrix, ndim=5) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, 3*to_block_size] + inner_band_product = inner_band_product * rsqrt_d + + # randn attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] + rand_band_product = rand_band_product * rsqrt_d + + # Including 1st block (since it's global) + first_band_product = torch.einsum( + "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + first_band_product = first_band_product * rsqrt_d + + # Including last block (since it's global) + last_band_product = torch.einsum( + "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + last_band_product = last_band_product * rsqrt_d + + # masking padded tokens + inner_band_product += (1.0 - band_mask) * attn_mask_penalty + first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty + last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty + rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty + + # completing attention scores matrix for all q[-2:2] + band_product = torch.cat( + [first_band_product, inner_band_product, rand_band_product, last_band_product], dim=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # safely doing softmax since attention matrix is completed + attn_weights = nn.functional.softmax( + band_product, dim=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # contribution of sliding keys + # [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + context_layer = self.torch_bmm_nd( + attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5 + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of random keys + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + context_layer += self.torch_bmm_nd( + attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5 + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of global keys + context_layer += torch.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + context_layer += torch.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # 4th PART + # last 2nd token attention scores + # q[-2] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> last 3 blocks + # global key block -> 1st block + # random key block -> based on indices stored in `randn_attn` + + second_last_key_mat = torch.cat( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, -3], + blocked_key_matrix[:, :, -2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, -1], + ], + dim=2, + ) # [bsz, n_heads, (4+n_random_blocks)*to_block_size, -1] + second_last_value_mat = torch.cat( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, -3], + blocked_value_matrix[:, :, -2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, -1], + ], + dim=2, + ) # [bsz, n_heads, (4+r)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4) + second_last_seq_pad = torch.cat( + [ + to_mask[:, :, :, :to_block_size], + to_mask[:, :, :, -3 * to_block_size :], + to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), + ], + dim=3, + ) + second_last_rand_pad = torch.cat( + [ + rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), + rand_mask[:, :, -1], + ], + dim=3, + ) + second_last_product = second_last_product * rsqrt_d + second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty + second_last_attn_weights = nn.functional.softmax( + second_last_product, dim=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, -1] + second_last_context_layer = self.torch_bmm_nd(second_last_attn_weights, second_last_value_mat, ndim=4) + second_last_context_layer.unsqueeze_(2) + + # 5th PART + # last block (global) attention scores + # q[-1] x (k[0], k[1], k[2], k[3], .... ) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4) + last_product = last_product * rsqrt_d + last_product += (1.0 - to_mask) * attn_mask_penalty + last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + last_context_layer = self.torch_bmm_nd(last_attn_weights, value_layer, ndim=4) + last_context_layer.unsqueeze_(2) + + # combining representations of all tokens + context_layer = torch.cat( + [first_context_layer, second_context_layer, context_layer, second_last_context_layer, last_context_layer], + dim=2, + ) + context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask + context_layer = torch.transpose(context_layer, 1, 2) + + # this is just for visualizing; forward pass doesn't depend on following code + if output_attentions: + # TODO(PVP): need to verify if below code is correct + attention_probs = torch.zeros( + bsz, n_heads, from_seq_len, to_seq_len, dtype=torch.float, device=context_layer.device + ) + + # 1st query block + # corresponding to `first_context_layer` + attention_probs[:, :, :from_block_size, :] = first_attn_weights # all keys global + + # 2nd query block + # corresponding to `second_context_layer` + attention_probs[:, :, from_block_size : 2 * from_block_size, : 3 * to_block_size] = second_attn_weights[ + :, :, :, : 3 * to_block_size + ] # 1st three key blocks (global + sliding) + attention_probs[:, :, from_block_size : 2 * from_block_size, -to_block_size:] = second_attn_weights[ + :, :, :, 3 * to_block_size : 4 * to_block_size + ] # last key block (global) + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, second_attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[:, 4 * to_block_size :] + attn_probs_view[p1, p2, 1, :, i2[0]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # Middle query blocks + # corresponding to `context_layer` + # sliding keys + for q_idx in range(from_seq_len // from_block_size - 4): + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + )[:, :, 2:-2, :, 1:-1, :] + right_slice = attn_weights[:, :, q_idx, :, to_block_size : 4 * to_block_size] + attn_probs_view[:, :, q_idx, :, q_idx : q_idx + 3, :] = right_slice.view( + bsz, n_heads, from_block_size, 3, to_block_size + ) # inner_band_product + # global keys (corresponding to 1st key block) + attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, :to_block_size] = attn_weights[ + :, :, :, :, :to_block_size + ].view(bsz, n_heads, -1, to_block_size) # first_band_product + # global keys (corresponding to last key block) + attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, -to_block_size:] = attn_weights[ + :, :, :, :, -to_block_size: + ].view(bsz, n_heads, -1, to_block_size) # last_band_product + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads + for q_idx in range(1, len(i2) - 1): + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[q_idx - 1, :, 4 * to_block_size : -to_block_size] + attn_probs_view[p1, p2, q_idx + 1, :, i2[q_idx]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # Second-last query block + # corresponding to `second_last_context_layer` + attention_probs[:, :, -2 * from_block_size : -from_block_size, :to_block_size] = second_last_attn_weights[ + :, :, :, :to_block_size + ] # 1st key block (global) + attention_probs[:, :, -2 * from_block_size : -from_block_size, -3 * to_block_size :] = ( + second_last_attn_weights[:, :, :, to_block_size : 4 * to_block_size] + ) # last three blocks (global + sliding) + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, second_last_attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[:, 4 * to_block_size :] + attn_probs_view[p1, p2, -2, :, i2[-1]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # last query block + # corresponding to `last_context_layer` + attention_probs[:, :, -from_block_size:, :] = last_attn_weights # all keys global + + else: + attention_probs = None + + return context_layer, attention_probs + + @staticmethod + def torch_gather_b2(params, indices): + # this operation is equivalent to tf.gather when batch_dims=2 + + if params.shape[:2] != indices.shape[:2]: + raise ValueError( + "Make sure that the first two dimensions of params and indices are identical, but" + f" they are params: {params.shape[:2]} vs. indices: {indices.shape[:2]}" + ) + num_indices_to_gather = indices.shape[-2] * indices.shape[-1] + num_indices_to_pick_from = params.shape[2] + + shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) + indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from + + flattened_indices = indices.view(-1) + indices_shift + flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) + + out_flattened = flattened_params.index_select(0, flattened_indices) + + out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:]) + return out + + @staticmethod + def _create_rand_mask_from_inputs( + from_blocked_mask, + to_blocked_mask, + rand_attn, + num_attention_heads, + num_rand_blocks, + batch_size, + from_seq_length, + from_block_size, + ): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + rand_attn: [batch_size, num_attention_heads, + from_seq_length//from_block_size-2, num_rand_blocks] + num_attention_heads: int. Number of attention heads. + num_rand_blocks: int. Number of random chunks per row. + batch_size: int. Batch size for computation. + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + + Returns: + float Tensor of shape [batch_size, num_attention_heads, from_seq_length//from_block_size-2, + from_block_size, num_rand_blocks*to_block_size]. + """ + num_windows = from_seq_length // from_block_size - 2 + rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) + rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size) + rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask) + return rand_mask + + @staticmethod + def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): + """ + Gives the plan of where to put random attention. + + Args: + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + num_rand_blocks: int. Number of random chunks per row. + + Returns: + plan_from_length: ending location of from block plan_num_rand_blocks: number of random ending location for + each block + """ + + plan_from_length = [] + plan_num_rand_blocks = [] + if (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((2 * num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(0) + elif (num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks // 2) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks - (num_rand_blocks // 2)) + else: + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks) + + return plan_from_length, plan_num_rand_blocks + + def _bigbird_block_rand_mask( + self, from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1 + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_rand_blocks: int. Number of random chunks per row. + last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence, + if positive then num_rand_blocks blocks chosen only up to last_idx. + + Returns: + adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks + """ + # using this method when from_seq_length in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32) + # During inference (eval) no randomness + if not self.training: + return rand_attn + middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32) + last = to_seq_length // to_block_size - 1 + if last_idx > (2 * to_block_size): + last = (last_idx // to_block_size) - 1 + + r = num_rand_blocks # shorthand + for i in range(1, from_seq_length // from_block_size - 1): + start = i - 2 + end = i + if i == 1: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[2:last])[:r] + elif i == 2: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[3:last])[:r] + elif i == from_seq_length // from_block_size - 3: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] + # Missing -3: should have been sliced till last-3 + elif i == from_seq_length // from_block_size - 2: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] + # Missing -4: should have been sliced till last-4 + else: + if start > last: + start = last + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r] + elif (end + 1) == last: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r] + else: + rand_attn[i - 1, :] = np.random.permutation( + np.concatenate((middle_seq[:start], middle_seq[end + 1 : last])) + )[:r] + return rand_attn + + def _bigbird_block_rand_mask_with_head( + self, + from_seq_length, + to_seq_length, + from_block_size, + to_block_size, + num_heads, + plan_from_length, + plan_num_rand_blocks, + window_block_left=1, + window_block_right=1, + global_block_top=1, + global_block_bottom=1, + global_block_left=1, + global_block_right=1, + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_heads: int. total number of heads. + plan_from_length: list. plan from length where num_random_blocks are chosen from. + plan_num_rand_blocks: list. number of rand blocks within the plan. + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_top: int. number of blocks at the top. + global_block_bottom: int. number of blocks at the bottom. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + adjacency list of size num_head where each element is of size from_seq_length//from_block_size-2 by + num_rand_blocks + """ + # using this method when from_seq_length not in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + if from_seq_length not in plan_from_length: + raise ValueError("Error from sequence length not in plan!") + + # Total number of blocks in the mmask + num_blocks = from_seq_length // from_block_size + # Number of blocks per plan + plan_block_length = np.array(plan_from_length) // from_block_size + # till when to follow plan + max_plan_idx = plan_from_length.index(from_seq_length) + + # Random Attention adjacency list + rand_attn = [ + np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32) + for i in range(num_heads) + ] + # During inference (eval) no randomness + if not self.training: + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn + + # We will go iteratively over the plan blocks and pick random number of + # Attention blocks from the legally allowed blocks + for plan_idx in range(max_plan_idx + 1): + rnd_r_cnt = 0 + if plan_idx > 0: + # set the row for all from_blocks starting from 0 to + # plan_block_length[plan_idx-1] + # column indx start from plan_block_length[plan_idx-1] and ends at + # plan_block_length[plan_idx] + if plan_num_rand_blocks[plan_idx] > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1])) + for blk_rw_idx in range(global_block_top, plan_block_length[plan_idx - 1]): + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=plan_block_length[plan_idx - 1], + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + for pl_id in range(plan_idx): + if plan_num_rand_blocks[pl_id] == 0: + continue + for blk_rw_idx in range(plan_block_length[plan_idx - 1], plan_block_length[plan_idx]): + rnd_r_cnt = 0 + to_start_block_id = 0 + if pl_id > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:pl_id])) + to_start_block_id = plan_block_length[pl_id - 1] + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: pl_id + 1])) + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[pl_id], + num_rand_blocks=plan_num_rand_blocks[pl_id], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + if plan_num_rand_blocks[plan_idx] == 0: + continue + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1])) + from_start_block_id = global_block_top + to_start_block_id = 0 + if plan_idx > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) + from_start_block_id = plan_block_length[plan_idx - 1] + to_start_block_id = plan_block_length[plan_idx - 1] + + for blk_rw_idx in range(from_start_block_id, plan_block_length[plan_idx]): + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + + return rand_attn + + @staticmethod + def _get_single_block_row_attention( + block_id, + to_start_block_id, + to_end_block_id, + num_rand_blocks, + window_block_left=1, + window_block_right=1, + global_block_left=1, + global_block_right=1, + ): + """ + For a single row block get random row attention. + + Args: + block_id: int. block id of row. + to_start_block_id: int. random attention column start id. + to_end_block_id: int. random attention column end id. + num_rand_blocks: int. number of random blocks to be selected. + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + row containing the random attention vector of size num_rand_blocks. + """ + # list of to_blocks from which to choose random attention + to_block_list = np.arange(to_start_block_id, to_end_block_id, dtype=np.int32) + # permute the blocks + perm_block = np.random.permutation(to_block_list) + + # illegal blocks for the current block id, using window + illegal_blocks = list(range(block_id - window_block_left, block_id + window_block_right + 1)) + + # Add blocks at the start and at the end + illegal_blocks.extend(list(range(global_block_left))) + illegal_blocks.extend(list(range(to_end_block_id - global_block_right, to_end_block_id))) + + # The second from_block cannot choose random attention on second last to_block + if block_id == 1: + illegal_blocks.append(to_end_block_id - 2) + + # The second last from_block cannot choose random attention on second to_block + if block_id == to_end_block_id - 2: + illegal_blocks.append(1) + + selected_random_blocks = [] + + for i in range(to_end_block_id - to_start_block_id): + if perm_block[i] not in illegal_blocks: + selected_random_blocks.append(perm_block[i]) + if len(selected_random_blocks) == num_rand_blocks: + break + return np.array(selected_random_blocks, dtype=np.int32) + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BigBird +class BigBirdSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BigBirdAttention(nn.Module): + def __init__(self, config, seed=None): + super().__init__() + self.attention_type = config.attention_type + self.config = config + self.seed = seed + + if self.config.attention_type == "original_full": + self.self = BigBirdSelfAttention(config, layer_idx=seed) + elif self.config.attention_type == "block_sparse": + self.self = BigBirdBlockSparseAttention(config, seed) + else: + raise ValueError( + f"attention_type can either be original_full or block_sparse, but is {self.config.attention_type}" + ) + + self.output = BigBirdSelfOutput(config) + + def set_attention_type(self, value: str, layer_idx=None): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + + self.attention_type = value + if value == "original_full": + # copy all weights to new full attention class + attn_weights = BigBirdSelfAttention(self.config, layer_idx=layer_idx) + else: + # copy all weights to new sparse attention class + attn_weights = BigBirdBlockSparseAttention(self.config, self.seed) + + attn_weights.query = self.self.query + attn_weights.value = self.self.value + attn_weights.key = self.self.key + self.self = attn_weights + if not self.training: + self.self.eval() + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + # block_sparse config + band_mask=None, + from_mask=None, + to_mask=None, + from_blocked_mask=None, + to_blocked_mask=None, + cache_position=None, + ): + # fp16 compatibility + if band_mask is not None: + band_mask = band_mask.to(hidden_states.dtype) + if from_mask is not None: + from_mask = from_mask.to(hidden_states.dtype) + if to_mask is not None: + to_mask = to_mask.to(hidden_states.dtype) + if self.attention_type == "original_full": + self_outputs = self.self( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + else: + if encoder_hidden_states is not None: + raise ValueError("BigBird cannot be used as a decoder when config.attention_type != 'original_full'") + self_outputs = self.self( + hidden_states, band_mask, from_mask, to_mask, from_blocked_mask, to_blocked_mask, output_attentions + ) + + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->BigBird +class BigBirdIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->BigBird +class BigBirdOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BigBirdLayer(GradientCheckpointingLayer): + def __init__(self, config, seed=None): + super().__init__() + self.config = config + self.attention_type = config.attention_type + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BigBirdAttention(config, seed=seed) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise TypeError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BigBirdAttention(config, seed=seed) + self.intermediate = BigBirdIntermediate(config) + self.output = BigBirdOutput(config) + + def set_attention_type(self, value: str, layer_idx=None): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + self.attention.set_attention_type(value, layer_idx=layer_idx) + + if self.add_cross_attention: + self.crossattention.set_attention_type(value, layer_idx=layer_idx) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + band_mask=None, + from_mask=None, + to_mask=None, + blocked_encoder_mask=None, + past_key_values=None, + output_attentions=False, + cache_position=None, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attention_outputs = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + band_mask=band_mask, + from_mask=from_mask, + to_mask=to_mask, + from_blocked_mask=blocked_encoder_mask, + to_blocked_mask=blocked_encoder_mask, + cache_position=cache_position, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + " cross-attention layers by setting `config.add_cross_attention=True`" + ) + + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + + return (layer_output,) + outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BigBirdEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.attention_type = config.attention_type + + self.layer = nn.ModuleList( + [BigBirdLayer(config, seed=layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + for i, layer in enumerate(self.layer): + layer.set_attention_type(value, layer_idx=i) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + band_mask=None, + from_mask=None, + to_mask=None, + blocked_encoder_mask=None, + return_dict=True, + cache_position=None, + ) -> Union[BaseModelOutputWithPastAndCrossAttentions, tuple]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + if use_cache and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `DynamicCache` instead, e.g. " + "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + band_mask, + from_mask, + to_mask, + blocked_encoder_mask, + past_key_values, + output_attentions, + cache_position, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + past_key_values, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->BigBird +class BigBirdPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->BigBird +class BigBirdLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BigBirdPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->BigBird +class BigBirdOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BigBirdLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->BigBird +class BigBirdOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->BigBird +class BigBirdPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BigBirdLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@auto_docstring +class BigBirdPreTrainedModel(PreTrainedModel): + config: BigBirdConfig + load_tf_weights = load_tf_weights_in_big_bird + base_model_prefix = "bert" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, BigBirdLMPredictionHead): + module.bias.data.zero_() + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`BigBirdForPreTraining`]. + """ +) +class BigBirdForPreTrainingOutput(ModelOutput): + r""" + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: Optional[torch.FloatTensor] = None + seq_relationship_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of question answering models. + """ +) +class BigBirdForQuestionAnsweringModelOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + pooler_output (`torch.FloatTensor` of shape `(batch_size, 1)`): + pooler output from BigBigModel + """ + + loss: Optional[torch.FloatTensor] = None + start_logits: Optional[torch.FloatTensor] = None + end_logits: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@auto_docstring +class BigBirdModel(BigBirdPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.attention_type = self.config.attention_type + self.config = config + + self.block_size = self.config.block_size + + self.embeddings = BigBirdEmbeddings(config) + self.encoder = BigBirdEncoder(config) + + if add_pooling_layer: + self.pooler = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + else: + self.pooler = None + self.activation = None + + if self.attention_type != "original_full" and config.add_cross_attention: + logger.warning( + "When using `BigBirdForCausalLM` as decoder, then `attention_type` must be `original_full`. Setting" + " `attention_type=original_full`" + ) + self.set_attention_type("original_full") + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + self.encoder.set_attention_type(value) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, # NOOP kwargs, for now + ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, tuple[torch.FloatTensor]]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = ( + past_key_values[0][0].shape[-2] + if not isinstance(past_key_values, Cache) + else past_key_values.get_seq_length() + ) + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # in order to use block_sparse attention, sequence_length has to be at least + # bigger than all global attentions: 2 * block_size + # + sliding tokens: 3 * block_size + # + random tokens: 2 * num_random_blocks * block_size + max_tokens_to_attend = (5 + 2 * self.config.num_random_blocks) * self.config.block_size + if self.attention_type == "block_sparse" and seq_length <= max_tokens_to_attend: + # change attention_type from block_sparse to original_full + sequence_length = input_ids.size(1) if input_ids is not None else inputs_embeds.size(1) + logger.warning( + "Attention type 'block_sparse' is not possible if sequence_length: " + f"{sequence_length} <= num global tokens: 2 * config.block_size " + "+ min. num sliding tokens: 3 * config.block_size " + "+ config.num_random_blocks * config.block_size " + "+ additional buffer: config.num_random_blocks * config.block_size " + f"= {max_tokens_to_attend} with config.block_size " + f"= {self.config.block_size}, config.num_random_blocks " + f"= {self.config.num_random_blocks}. " + "Changing attention type to 'original_full'..." + ) + self.set_attention_type("original_full") + + if self.attention_type == "block_sparse": + ( + padding_len, + input_ids, + attention_mask, + token_type_ids, + position_ids, + inputs_embeds, + ) = self._pad_to_block_size( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + pad_token_id=self.config.pad_token_id, + ) + else: + padding_len = 0 + + if self.attention_type == "block_sparse": + blocked_encoder_mask, band_mask, from_mask, to_mask = self.create_masks_for_block_sparse_attn( + attention_mask, self.block_size + ) + extended_attention_mask = None + + elif self.attention_type == "original_full": + blocked_encoder_mask = None + band_mask = None + from_mask = None + to_mask = None + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + else: + raise ValueError( + f"attention_type can either be original_full or block_sparse, but is {self.attention_type}" + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + band_mask=band_mask, + from_mask=from_mask, + to_mask=to_mask, + blocked_encoder_mask=blocked_encoder_mask, + return_dict=return_dict, + cache_position=cache_position, + ) + sequence_output = encoder_outputs[0] + + pooler_output = self.activation(self.pooler(sequence_output[:, 0, :])) if (self.pooler is not None) else None + + # undo padding + if padding_len > 0: + # unpad `sequence_output` because the calling function is expecting a length == input_ids.size(1) + sequence_output = sequence_output[:, :-padding_len] + + if not return_dict: + return (sequence_output, pooler_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooler_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + @staticmethod + def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int): + batch_size, seq_length = attention_mask.size() + if seq_length % block_size != 0: + raise ValueError( + f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block" + f" size is {block_size}." + ) + + def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + + Returns: + float Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, from_block_size, + 3*to_block_size]. + """ + exp_blocked_to_pad = torch.cat( + [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2 + ) + band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad) + band_mask.unsqueeze_(1) + return band_mask + + blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size) + band_mask = create_band_mask_from_inputs(blocked_encoder_mask, blocked_encoder_mask) + + from_mask = attention_mask.view(batch_size, 1, seq_length, 1) + to_mask = attention_mask.view(batch_size, 1, 1, seq_length) + + return blocked_encoder_mask, band_mask, from_mask, to_mask + + def _pad_to_block_size( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + token_type_ids: torch.Tensor, + position_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + pad_token_id: int, + ): + """A helper function to pad tokens and mask to work with implementation of BigBird block-sparse attention.""" + # padding + block_size = self.config.block_size + + input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape + batch_size, seq_len = input_shape[:2] + + padding_len = (block_size - seq_len % block_size) % block_size + if padding_len > 0: + logger.warning_once( + f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of " + f"`config.block_size`: {block_size}" + ) + if input_ids is not None: + input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id) + if position_ids is not None: + # pad with position_id = pad_token_id as in modeling_bigbird.BigBirdEmbeddings + position_ids = nn.functional.pad(position_ids, (0, padding_len), value=pad_token_id) + if inputs_embeds is not None: + input_ids_padding = inputs_embeds.new_full( + (batch_size, padding_len), + self.config.pad_token_id, + dtype=torch.long, + ) + inputs_embeds_padding = self.embeddings(input_ids_padding) + inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2) + + attention_mask = nn.functional.pad( + attention_mask, (0, padding_len), value=False + ) # no attention on the padding tokens + token_type_ids = nn.functional.pad(token_type_ids, (0, padding_len), value=0) # pad with token_type_id = 0 + + return padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds + + +class BigBirdForPreTraining(BigBirdPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BigBirdModel(config, add_pooling_layer=True) + self.cls = BigBirdPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.FloatTensor] = None, + next_sentence_label: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BigBirdForPreTrainingOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. If specified, nsp loss will be + added to masked_lm loss. Input should be a sequence pair (see `input_ids` docstring) Indices should be in + `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Example: + + ```python + >>> from transformers import AutoTokenizer, BigBirdForPreTraining + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") + >>> model = BigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + total_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if next_sentence_label is not None and total_loss is not None: + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = total_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BigBirdForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class BigBirdForMaskedLM(BigBirdPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `BigBirdForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.bert = BigBirdModel(config) + self.cls = BigBirdOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MaskedLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, BigBirdForMaskedLM + >>> from datasets import load_dataset + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") + >>> model = BigBirdForMaskedLM.from_pretrained("google/bigbird-roberta-base") + >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT + + >>> # select random long article + >>> LONG_ARTICLE_TARGET = squad_ds[81514]["context"] + >>> # select random sentence + >>> LONG_ARTICLE_TARGET[332:398] + 'the highest values are very close to the theoretical maximum value' + + >>> # add mask_token + >>> LONG_ARTICLE_TO_MASK = LONG_ARTICLE_TARGET.replace("maximum", "[MASK]") + >>> inputs = tokenizer(LONG_ARTICLE_TO_MASK, return_tensors="pt") + >>> # long article input + >>> list(inputs["input_ids"].shape) + [1, 919] + + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + >>> # retrieve index of [MASK] + >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0] + >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1) + >>> tokenizer.decode(predicted_token_id) + 'maximum' + ``` + + ```python + >>> labels = tokenizer(LONG_ARTICLE_TARGET, return_tensors="pt")["input_ids"] + >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100) + >>> outputs = model(**inputs, labels=labels) + >>> round(outputs.loss.item(), 2) + 1.99 + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError("The PAD token should be defined for generation") + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@auto_docstring( + custom_intro=""" + BigBird Model with a `language modeling` head on top for CLM fine-tuning. + """ +) +class BigBirdForCausalLM(BigBirdPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `BigBirdForCausalLM` as a standalone, add `is_decoder=True.`") + + self.bert = BigBirdModel(config) + self.cls = BigBirdOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[CausalLMOutputWithCrossAttentions, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + lm_loss = self.loss_function( + prediction_scores, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +class BigBirdClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + self.config = config + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = ACT2FN[self.config.hidden_act](x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@auto_docstring( + custom_intro=""" + BigBird Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """ +) +class BigBirdForSequenceClassification(BigBirdPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.bert = BigBirdModel(config) + self.classifier = BigBirdClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, BigBirdForSequenceClassification + >>> from datasets import load_dataset + + >>> tokenizer = AutoTokenizer.from_pretrained("l-yohai/bigbird-roberta-base-mnli") + >>> model = BigBirdForSequenceClassification.from_pretrained("l-yohai/bigbird-roberta-base-mnli") + >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT + + >>> LONG_ARTICLE = squad_ds[81514]["context"] + >>> inputs = tokenizer(LONG_ARTICLE, return_tensors="pt") + >>> # long input article + >>> list(inputs["input_ids"].shape) + [1, 919] + + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + >>> predicted_class_id = logits.argmax().item() + >>> model.config.id2label[predicted_class_id] + 'LABEL_0' + ``` + + ```python + >>> num_labels = len(model.config.id2label) + >>> model = BigBirdForSequenceClassification.from_pretrained( + ... "l-yohai/bigbird-roberta-base-mnli", num_labels=num_labels + ... ) + >>> labels = torch.tensor(1) + >>> loss = model(**inputs, labels=labels).loss + >>> round(loss.item(), 2) + 1.13 + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class BigBirdForMultipleChoice(BigBirdPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BigBirdModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MultipleChoiceModelOutput, tuple[torch.FloatTensor]]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class BigBirdForTokenClassification(BigBirdPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BigBirdModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[TokenClassifierOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BigBirdForQuestionAnsweringHead(nn.Module): + """Head for question answering tasks.""" + + def __init__(self, config): + super().__init__() + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.intermediate = BigBirdIntermediate(config) + self.output = BigBirdOutput(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, encoder_output): + hidden_states = self.dropout(encoder_output) + hidden_states = self.intermediate(hidden_states) + hidden_states = self.output(hidden_states, encoder_output) + hidden_states = self.qa_outputs(hidden_states) + return hidden_states + + +@auto_docstring +class BigBirdForQuestionAnswering(BigBirdPreTrainedModel): + def __init__(self, config, add_pooling_layer=False): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + self.sep_token_id = config.sep_token_id + + self.bert = BigBirdModel(config, add_pooling_layer=add_pooling_layer) + self.qa_classifier = BigBirdForQuestionAnsweringHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + question_lengths: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BigBirdForQuestionAnsweringModelOutput, tuple[torch.FloatTensor]]: + r""" + question_lengths (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*): + The lengths of the questions in the batch. + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, BigBirdForQuestionAnswering + >>> from datasets import load_dataset + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") + >>> model = BigBirdForQuestionAnswering.from_pretrained("google/bigbird-roberta-base") + >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT + + >>> # select random article and question + >>> LONG_ARTICLE = squad_ds[81514]["context"] + >>> QUESTION = squad_ds[81514]["question"] + >>> QUESTION + 'During daytime how high can the temperatures reach?' + + >>> inputs = tokenizer(QUESTION, LONG_ARTICLE, return_tensors="pt") + >>> # long article and question input + >>> list(inputs["input_ids"].shape) + [1, 929] + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> answer_start_index = outputs.start_logits.argmax() + >>> answer_end_index = outputs.end_logits.argmax() + >>> predict_answer_token_ids = inputs.input_ids[0, answer_start_index : answer_end_index + 1] + >>> predict_answer_token = tokenizer.decode(predict_answer_token_ids) + ``` + + ```python + >>> target_start_index, target_end_index = torch.tensor([130]), torch.tensor([132]) + >>> outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index) + >>> loss = outputs.loss + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + seqlen = input_ids.size(1) if input_ids is not None else inputs_embeds.size(1) + + if question_lengths is None and input_ids is not None: + # assuming input_ids format: context + question_lengths = torch.argmax(input_ids.eq(self.sep_token_id).int(), dim=-1) + 1 + question_lengths.unsqueeze_(1) + + logits_mask = None + if question_lengths is not None: + # setting lengths logits to `-inf` + logits_mask = self.prepare_question_mask(question_lengths, seqlen) + if token_type_ids is None: + token_type_ids = torch.ones(logits_mask.size(), dtype=int, device=logits_mask.device) - logits_mask + logits_mask[:, 0] = False + logits_mask.unsqueeze_(2) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.qa_classifier(sequence_output) + + if logits_mask is not None: + # removing question tokens from the competition + logits = logits - logits_mask * 1e6 + + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BigBirdForQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + pooler_output=outputs.pooler_output, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @staticmethod + def prepare_question_mask(q_lengths: torch.Tensor, maxlen: int): + # q_lengths -> (bz, 1) + mask = torch.arange(0, maxlen).to(q_lengths.device) + mask.unsqueeze_(0) # -> (1, maxlen) + mask = torch.where(mask < q_lengths, 1, 0) + return mask + + +__all__ = [ + "BigBirdForCausalLM", + "BigBirdForMaskedLM", + "BigBirdForMultipleChoice", + "BigBirdForPreTraining", + "BigBirdForQuestionAnswering", + "BigBirdForSequenceClassification", + "BigBirdForTokenClassification", + "BigBirdLayer", + "BigBirdModel", + "BigBirdPreTrainedModel", + "load_tf_weights_in_big_bird", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/modeling_flax_big_bird.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/modeling_flax_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..11dcb30f3d47e3ecf6d90d7a7305682d687e5828 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/modeling_flax_big_bird.py @@ -0,0 +1,2648 @@ +# coding=utf-8 +# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen import partitioning as nn_partitioning +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxBaseModelOutputWithPooling, + FlaxBaseModelOutputWithPoolingAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, + FlaxMaskedLMOutput, + FlaxMultipleChoiceModelOutput, + FlaxSequenceClassifierOutput, + FlaxTokenClassifierOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_call_sample_docstring, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_big_bird import BigBirdConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/bigbird-roberta-base" +_CONFIG_FOR_DOC = "BigBirdConfig" + +remat = nn_partitioning.remat + + +@flax.struct.dataclass +class FlaxBigBirdForPreTrainingOutput(ModelOutput): + """ + Output type of [`BigBirdForPreTraining`]. + + Args: + prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + prediction_logits: jnp.ndarray = None + seq_relationship_logits: jnp.ndarray = None + hidden_states: Optional[tuple[jnp.ndarray]] = None + attentions: Optional[tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxBigBirdForQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + pooled_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`): + pooled_output returned by FlaxBigBirdModel. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + start_logits: jnp.ndarray = None + end_logits: jnp.ndarray = None + pooled_output: jnp.ndarray = None + hidden_states: Optional[tuple[jnp.ndarray]] = None + attentions: Optional[tuple[jnp.ndarray]] = None + + +BIG_BIRD_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a + [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as + a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and + behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`BigBirdConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +BIG_BIRD_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + head_mask (`numpy.ndarray` of shape `({0})`, `optional): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + +""" + + +class FlaxBigBirdEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings.setup + def setup(self): + self.word_embeddings = nn.Embed( + self.config.vocab_size, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.position_embeddings = nn.Embed( + self.config.max_position_embeddings, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.token_type_embeddings = nn.Embed( + self.config.type_vocab_size, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True): + # Embed + inputs_embeds = self.word_embeddings(input_ids.astype("i4")) + position_embeds = self.position_embeddings(position_ids.astype("i4")) + token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4")) + + if self.config.rescale_embeddings: + inputs_embeds *= self.config.hidden_size**0.5 + + # Sum all embeddings + hidden_states = inputs_embeds + token_type_embeddings + position_embeds + + # Layer Norm + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert->BigBird +class FlaxBigBirdSelfAttention(nn.Module): + config: BigBirdConfig + causal: bool = False + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.head_dim = self.config.hidden_size // self.config.num_attention_heads + if self.config.hidden_size % self.config.num_attention_heads != 0: + raise ValueError( + "`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` " + " : {self.config.num_attention_heads}" + ) + + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,)) + + @nn.compact + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic=True, + output_attentions: bool = False, + ): + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + # get query proj + query_states = self.query(hidden_states) + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self.key(key_value_states) + value_states = self.value(key_value_states) + else: + # self_attention + key_states = self.key(hidden_states) + value_states = self.value(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + # handle cache prepare causal attention mask + if self.causal: + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + + # Convert the boolean attention mask to an attention bias. + if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.config.attention_probs_dropout_prob > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_probs_dropout_prob, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,)) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +class FlaxBigBirdBlockSparseAttention(nn.Module): + config: BigBirdConfig + block_sparse_seed: Optional[int] = None + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + use_bias=self.config.use_bias, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + use_bias=self.config.use_bias, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + use_bias=self.config.use_bias, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + @staticmethod + def transpose_for_scores(x, n_heads, head_size): + new_x_shape = x.shape[:-1] + (n_heads, head_size) + x = x.reshape(*new_x_shape) + return jnp.transpose(x, axes=(0, 2, 1, 3)) + + def __call__( + self, + hidden_states, + attention_mask, + deterministic=True, + output_attentions=False, + ): + n_heads = self.config.num_attention_heads + head_size = self.config.hidden_size // n_heads + + blocked_encoder_mask, band_mask, from_mask, to_mask = self.create_masks_for_block_sparse_attn( + attention_mask, self.config.block_size + ) + + query_layer = self.transpose_for_scores(self.query(hidden_states), n_heads, head_size) + key_layer = self.transpose_for_scores(self.key(hidden_states), n_heads, head_size) + value_layer = self.transpose_for_scores(self.value(hidden_states), n_heads, head_size) + + indices_prng_key = None + if not deterministic: + indices_prng_key = self.make_rng("indices") + + attn_output, attn_weights = self.bigbird_block_sparse_attention( + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + blocked_encoder_mask, + blocked_encoder_mask, + n_heads, + head_size, + indices_prng_key=indices_prng_key, + deterministic=deterministic, + plan_from_length=None, + plan_num_rand_blocks=None, + output_attentions=output_attentions, + ) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + @staticmethod + def create_masks_for_block_sparse_attn(attention_mask, block_size: int): + batch_size, seq_length = attention_mask.shape + if seq_length % block_size != 0: + raise ValueError( + f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block" + f" size is {block_size}." + ) + + def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + + Returns: + float Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, from_block_size, + 3*to_block_size]. + """ + exp_blocked_to_pad = jnp.concatenate( + [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], axis=2 + ) + band_mask = jnp.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad) + band_mask = jnp.expand_dims(band_mask, 1) + return band_mask + + blocked_encoder_mask = attention_mask.reshape(batch_size, seq_length // block_size, block_size) + band_mask = create_band_mask_from_inputs(blocked_encoder_mask, blocked_encoder_mask) + + from_mask = attention_mask.reshape(batch_size, 1, seq_length, 1) + to_mask = attention_mask.reshape(batch_size, 1, 1, seq_length) + + return blocked_encoder_mask, band_mask, from_mask, to_mask + + def bigbird_block_sparse_attention( + self, + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + n_heads, + head_size, + indices_prng_key: Optional[jax.random.PRNGKey] = None, + deterministic: Optional[bool] = True, + plan_from_length=None, + plan_num_rand_blocks=None, + output_attentions=None, + ): + # BigBird block-sparse attention as suggested in paper + + # ITC: + # global tokens: 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # ETC: + # global tokens: extra_globals_tokens + 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # Note: + # 1) Currently, ETC is not supported. + # 2) Window size is fixed to 3 blocks & it can be changed only by + # changing `block_size`. + # 3) Number of global blocks are fixed (2 blocks here) & global tokens can be + # controlled only by `block_size`. + + # attention is calculated separately for q[0], q[1], q[2:-2], q[-2], q[-1] in order to use special trick of + # shifting tokens (for calculating sliding attention). hence following code can be divided into 5 parts. + + bsz, _, from_seq_len, _ = query_layer.shape + to_seq_len = key_layer.shape[2] + from_block_size = to_block_size = self.config.block_size + + if from_seq_len % from_block_size != 0: + raise ValueError("Query sided sequence length must be multiple of block size") + + if to_seq_len % to_block_size != 0: + raise ValueError("Key/Value sided sequence length must be multiple of block size") + + if from_seq_len // from_block_size != to_seq_len // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + n_rand_blocks = self.config.num_random_blocks + rsqrt_d = 1 / jnp.sqrt(head_size) + attn_mask_penalty = -10000.0 + + if from_seq_len in [1024, 3072, 4096]: # old plans used in paper + max_seqlen = self.config.max_position_embeddings + rand_attn = [ + self._bigbird_block_rand_mask( + max_seqlen, + max_seqlen, + from_block_size, + to_block_size, + n_rand_blocks, + indices_prng_key=indices_prng_key, + deterministic=deterministic, + last_idx=1024, + )[: (from_seq_len // from_block_size - 2)] + for _ in range(n_heads) + ] + else: + if plan_from_length is None: + plan_from_length, plan_num_rand_blocks = self._get_rand_attn_plan( + from_seq_len, from_block_size, n_rand_blocks + ) + rand_attn = self._bigbird_block_rand_mask_with_head( + from_seq_length=from_seq_len, + to_seq_length=to_seq_len, + from_block_size=from_block_size, + to_block_size=to_block_size, + num_heads=n_heads, + plan_from_length=plan_from_length, + plan_num_rand_blocks=plan_num_rand_blocks, + indices_prng_key=indices_prng_key, + ) + + rand_attn = jnp.stack(rand_attn, axis=0) + rand_attn = jnp.broadcast_to(rand_attn, (bsz,) + rand_attn.shape) + + rand_mask = self._create_rand_mask_from_inputs( + from_blocked_mask, to_blocked_mask, rand_attn, n_heads, n_rand_blocks, bsz, from_seq_len, from_block_size + ) + + blocked_query_matrix = query_layer.reshape(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1) + blocked_key_matrix = key_layer.reshape(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + blocked_value_matrix = value_layer.reshape(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + + shape = (bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1) + gathered_key = self.jax_gather(blocked_key_matrix, rand_attn, batch_dims=2).reshape(*shape) + gathered_value = self.jax_gather(blocked_value_matrix, rand_attn, batch_dims=2).reshape(*shape) + + # 1st PART + # 1st block (global block) attention scores + # q[0] x (k[0], k[1], k[2], k[3], k[4] .... ) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + first_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, 0], key_layer) + + first_product = first_product * rsqrt_d + first_product += (1.0 - to_mask) * attn_mask_penalty + first_attn_weights = jax.nn.softmax(first_product, axis=-1) # [bsz, n_heads, from_block_size, to_seq_len] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + first_context_layer = jnp.einsum("bhqk,bhkd->bhqd", first_attn_weights, value_layer) + first_context_layer = jnp.expand_dims(first_context_layer, 2) + + # 2nd PART + # 2nd block attention scores + # q[1] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> 2nd, 3rd blocks + # global key blocks -> 1st block + + second_key_mat = jnp.concatenate( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, 1], + blocked_key_matrix[:, :, 2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, 0], + ], + axis=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + second_value_mat = jnp.concatenate( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, 1], + blocked_value_matrix[:, :, 2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, 0], + ], + axis=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + # ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, 1], second_key_mat) + second_seq_pad = jnp.concatenate( + [ + to_mask[:, :, :, : 3 * to_block_size], + to_mask[:, :, :, -to_block_size:], + jnp.ones([bsz, 1, 1, n_rand_blocks * to_block_size], dtype=to_mask.dtype), + ], + axis=3, + ) + second_rand_pad = jnp.concatenate( + [ + jnp.ones([bsz, n_heads, from_block_size, 4 * to_block_size], dtype=rand_mask.dtype), + rand_mask[:, :, 0], + ], + axis=3, + ) + second_product = second_product * rsqrt_d + second_product += (1.0 - jnp.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty + second_attn_weights = jax.nn.softmax( + second_product, axis=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+r)*to_block_size] x [bsz, n_heads, (4+r)*to_block_size, -1] + # ==> [bsz, n_heads, from_block_size, -1] + second_context_layer = jnp.einsum("bhqk,bhkd->bhqd", second_attn_weights, second_value_mat) + second_context_layer = jnp.expand_dims(second_context_layer, 2) + + # 3rd PART + # Middle blocks attention scores + # q[-2:2] x (sliding_keys, random_keys, global_keys) + # sliding attn is calculated using special trick of shifting tokens as discussed in paper + # random keys are generated by taking random indices as per `rand_attn` + # global keys -> 1st & last block + + exp_blocked_key_matrix = jnp.concatenate( + [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], axis=3 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + exp_blocked_value_matrix = jnp.concatenate( + [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]], + axis=3, + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + middle_query_matrix = blocked_query_matrix[:, :, 2:-2] + + # sliding attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [b, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + inner_band_product = jnp.einsum("bhlqd,bhlkd->bhlqk", middle_query_matrix, exp_blocked_key_matrix) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, 3*to_block_size] + inner_band_product = inner_band_product * rsqrt_d + + # randn attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + # x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + rand_band_product = jnp.einsum("bhlqd,bhlkd->bhlqk", middle_query_matrix, gathered_key[:, :, 1:-1]) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] + rand_band_product = rand_band_product * rsqrt_d + + # Including 1st block (since it's global) + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + first_band_product = jnp.einsum("bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0]) + first_band_product = first_band_product * rsqrt_d + + # Including last block (since it's global) + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + last_band_product = jnp.einsum("bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1]) + last_band_product = last_band_product * rsqrt_d + + # masking padded tokens + inner_band_product += (1.0 - band_mask) * attn_mask_penalty + first_band_product += (1.0 - jnp.expand_dims(to_mask[:, :, :, :to_block_size], 3)) * attn_mask_penalty + last_band_product += (1.0 - jnp.expand_dims(to_mask[:, :, :, -to_block_size:], 3)) * attn_mask_penalty + rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty + + # completing attention scores matrix for all q[-2:2] + band_product = jnp.concatenate( + [first_band_product, inner_band_product, rand_band_product, last_band_product], axis=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # safely doing softmax since attention matrix is completed + attn_weights = jax.nn.softmax( + band_product, axis=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # contribution of sliding keys + # [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] + # x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + context_layer = jnp.einsum( + "bhlqk,bhlkd->bhlqd", attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of random keys + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] + # x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + context_layer += jnp.einsum( + "bhlqk,bhlkd->bhlqd", + attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], + gathered_value[:, :, 1:-1], + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of global keys + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + context_layer += jnp.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0] + ) + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + context_layer += jnp.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1] + ) + + # 4th PART + # last 2nd token attention scores + # q[-2] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> last 3 blocks + # global key block -> 1st block + # random key block -> based on indices stored in `randn_attn` + + second_last_key_mat = jnp.concatenate( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, -3], + blocked_key_matrix[:, :, -2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, -1], + ], + axis=2, + ) # [bsz, n_heads, (4+n_random_blocks)*to_block_size, -1] + second_last_value_mat = jnp.concatenate( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, -3], + blocked_value_matrix[:, :, -2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, -1], + ], + axis=2, + ) # [bsz, n_heads, (4+r)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + # ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_last_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, -2], second_last_key_mat) + second_last_seq_pad = jnp.concatenate( + [ + to_mask[:, :, :, :to_block_size], + to_mask[:, :, :, -3 * to_block_size :], + jnp.ones([bsz, 1, 1, n_rand_blocks * to_block_size], dtype=to_mask.dtype), + ], + axis=3, + ) + second_last_rand_pad = jnp.concatenate( + [ + jnp.ones([bsz, n_heads, from_block_size, 4 * to_block_size], dtype=rand_mask.dtype), + rand_mask[:, :, -1], + ], + axis=3, + ) + second_last_product = second_last_product * rsqrt_d + second_last_product += (1.0 - jnp.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty + second_last_attn_weights = jax.nn.softmax( + second_last_product, axis=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + # ==> [bsz, n_heads, from_block_size, -1] + second_last_context_layer = jnp.einsum("bhqk,bhkd->bhqd", second_last_attn_weights, second_last_value_mat) + second_last_context_layer = jnp.expand_dims(second_last_context_layer, 2) + + # 5th PART + # last block (global) attention scores + # q[-1] x (k[0], k[1], k[2], k[3], .... ) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + last_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, -1], key_layer) + last_product = last_product * rsqrt_d + last_product += (1.0 - to_mask) * attn_mask_penalty + last_attn_weights = jax.nn.softmax(last_product, axis=-1) # [bsz, n_heads, from_block_size, n] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + last_context_layer = jnp.einsum("bhqk,bhkd->bhqd", last_attn_weights, value_layer) + last_context_layer = jnp.expand_dims(last_context_layer, 2) + + # combining representations of all tokens + context_layer = jnp.concatenate( + [first_context_layer, second_context_layer, context_layer, second_last_context_layer, last_context_layer], + axis=2, + ) + context_layer = context_layer.reshape(bsz, n_heads, from_seq_len, -1) * from_mask + context_layer = jnp.transpose(context_layer, axes=(0, 2, 1, 3)).reshape(bsz, from_seq_len, -1) + + attention_probs = None + + return context_layer, attention_probs + + @staticmethod + def jax_gather(params, indices, batch_dims=2): + """ + Gather the indices from params correctly (equivalent to tf.gather but with modifications) + + Args: + params: (bsz, n_heads, num_blocks, block_size, head_dim) + indices: (bhlqk", from_blocked_mask[:, 1:-1], rand_mask) + return rand_mask + + @staticmethod + def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): + """ + Gives the plan of where to put random attention. + + Args: + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + num_rand_blocks: int. Number of random chunks per row. + + Returns: + plan_from_length: ending location of from block plan_num_rand_blocks: number of random ending location for + each block + """ + + plan_from_length = [] + plan_num_rand_blocks = [] + if (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((2 * num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(0) + elif (num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks // 2) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks - (num_rand_blocks // 2)) + else: + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks) + + return plan_from_length, plan_num_rand_blocks + + @staticmethod + def _bigbird_block_rand_mask( + from_seq_length, + to_seq_length, + from_block_size, + to_block_size, + num_rand_blocks, + indices_prng_key: Optional[jax.random.PRNGKey] = None, + deterministic: Optional[bool] = True, + last_idx: Optional[int] = -1, + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_rand_blocks: int. Number of random chunks per row. + indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations. + deterministic: bool. When False random attention will be used. + last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence, + if positive then num_rand_blocks blocks chosen only up to last_idx. + + Returns: + adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks + """ + # using this method when from_seq_length in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + rand_attn = jnp.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=jnp.int32) + # deterministic nor randomness + if deterministic: + return rand_attn + + middle_seq = jnp.arange(1, to_seq_length // to_block_size - 1, dtype=jnp.int32) + last = to_seq_length // to_block_size - 1 + if last_idx > (2 * to_block_size): + last = (last_idx // to_block_size) - 1 + + r = num_rand_blocks # shorthand + for i in range(1, from_seq_length // from_block_size - 1): + start = i - 2 + end = i + if i == 1: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[2:last])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + elif i == 2: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[3:last])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + elif i == from_seq_length // from_block_size - 3: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[:last])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + # Missing -3: should have been sliced till last-3 + elif i == from_seq_length // from_block_size - 2: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[:last])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + # Missing -4: should have been sliced till last-4 + else: + if start > last: + start = last + seq_values = jax.random.permutation(indices_prng_key, middle_seq[:start])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + elif (end + 1) == last: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[:start])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + else: + concat_values = jnp.concatenate((middle_seq[:start], middle_seq[end + 1 : last])) + seq_values = jax.random.permutation(indices_prng_key, concat_values)[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + return rand_attn + + def _bigbird_block_rand_mask_with_head( + self, + from_seq_length, + to_seq_length, + from_block_size, + to_block_size, + num_heads, + plan_from_length, + plan_num_rand_blocks, + indices_prng_key: Optional[jax.random.PRNGKey] = None, + deterministic: Optional[bool] = True, + window_block_left=1, + window_block_right=1, + global_block_top=1, + global_block_bottom=1, + global_block_left=1, + global_block_right=1, + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_heads: int. total number of heads. + plan_from_length: list. plan from length where num_random_blocks are chosen from. + plan_num_rand_blocks: list. number of rand blocks within the plan. + indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations. + deterministic: bool. When False random attention will be used. + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_top: int. number of blocks at the top. + global_block_bottom: int. number of blocks at the bottom. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + adjacency list of size num_head where each element is of size from_seq_length//from_block_size-2 by + num_rand_blocks + """ + # using this method when from_seq_length not in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + if from_seq_length not in plan_from_length: + raise ValueError("Error from sequence length not in plan!") + + # Total number of blocks in the mmask + num_blocks = from_seq_length // from_block_size + # Number of blocks per plan + plan_block_length = jnp.array(plan_from_length) // from_block_size + # till when to follow plan + max_plan_idx = plan_from_length.index(from_seq_length) + + # Random Attention adjacency list + rand_attn = [ + jnp.zeros((num_blocks, sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=jnp.int32) + for i in range(num_heads) + ] + + # deterministic + if deterministic: + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn + + # We will go iteratively over the plan blocks and pick random number of + # Attention blocks from the legally allowed blocks + for plan_idx in range(max_plan_idx + 1): + rnd_r_cnt = 0 + if plan_idx > 0: + # set the row for all from_blocks starting from 0 to + # plan_block_length[plan_idx-1] + # column indx start from plan_block_length[plan_idx-1] and ends at + # plan_block_length[plan_idx] + if plan_num_rand_blocks[plan_idx] > 0: + rnd_r_cnt = int(sum(plan_num_rand_blocks[:plan_idx])) + curr_r_cnt = int(sum(plan_num_rand_blocks[: plan_idx + 1])) + for blk_rw_idx in range(global_block_top, plan_block_length[plan_idx - 1]): + for h in range(num_heads): + single_block_row_attention = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=plan_block_length[plan_idx - 1], + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + indices_prng_key=indices_prng_key, + ) + rand_attn[h] = ( + rand_attn[h].at[blk_rw_idx, rnd_r_cnt:curr_r_cnt].set(single_block_row_attention) + ) + + for pl_id in range(plan_idx): + if plan_num_rand_blocks[pl_id] == 0: + continue + for blk_rw_idx in range(plan_block_length[plan_idx - 1], plan_block_length[plan_idx]): + rnd_r_cnt = 0 + to_start_block_id = 0 + if pl_id > 0: + rnd_r_cnt = int(sum(plan_num_rand_blocks[:pl_id])) + to_start_block_id = plan_block_length[pl_id - 1] + curr_r_cnt = int(sum(plan_num_rand_blocks[: pl_id + 1])) + for h in range(num_heads): + single_block_row_attention = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[pl_id], + num_rand_blocks=plan_num_rand_blocks[pl_id], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + indices_prng_key=indices_prng_key, + ) + rand_attn[h] = ( + rand_attn[h].at[blk_rw_idx, rnd_r_cnt:curr_r_cnt].set(single_block_row_attention) + ) + + if plan_num_rand_blocks[plan_idx] == 0: + continue + curr_r_cnt = int(sum(plan_num_rand_blocks[: plan_idx + 1])) + from_start_block_id = global_block_top + to_start_block_id = 0 + if plan_idx > 0: + rnd_r_cnt = int(sum(plan_num_rand_blocks[:plan_idx])) + from_start_block_id = plan_block_length[plan_idx - 1] + to_start_block_id = plan_block_length[plan_idx - 1] + for blk_rw_idx in range(from_start_block_id, plan_block_length[plan_idx]): + for h in range(num_heads): + single_block_row_attention = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + indices_prng_key=indices_prng_key, + ) + rand_attn[h] = rand_attn[h].at[blk_rw_idx, rnd_r_cnt:curr_r_cnt].set(single_block_row_attention) + + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn + + @staticmethod + def _get_single_block_row_attention( + block_id, + to_start_block_id, + to_end_block_id, + num_rand_blocks, + indices_prng_key: Optional[jax.random.PRNGKey] = None, + window_block_left=1, + window_block_right=1, + global_block_left=1, + global_block_right=1, + ): + """ + For a single row block get random row attention. + + Args: + block_id: int. block id of row. + to_start_block_id: int. random attention column start id. + to_end_block_id: int. random attention column end id. + num_rand_blocks: int. number of random blocks to be selected. + indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + row containing the random attention vector of size num_rand_blocks. + """ + # list of to_blocks from which to choose random attention + to_block_list = jnp.arange(to_start_block_id, to_end_block_id, dtype=jnp.int32) + # permute the blocks + perm_block = jax.random.permutation(indices_prng_key, to_block_list) + + # illegal blocks for the current block id, using window + illegal_blocks = list(range(block_id - window_block_left, block_id + window_block_right + 1)) + + # Add blocks at the start and at the end + illegal_blocks.extend(list(range(global_block_left))) + illegal_blocks.extend(list(range(to_end_block_id - global_block_right, to_end_block_id))) + + # The second from_block cannot choose random attention on second last to_block + if block_id == 1: + illegal_blocks.append(to_end_block_id - 2) + + # The second last from_block cannot choose random attention on second to_block + if block_id == to_end_block_id - 2: + illegal_blocks.append(1) + + selected_random_blocks = [] + + for i in range(to_end_block_id - to_start_block_id): + if perm_block[i] not in illegal_blocks: + selected_random_blocks.append(perm_block[i]) + if len(selected_random_blocks) == num_rand_blocks: + break + return jnp.array(selected_random_blocks, dtype=jnp.int32) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfOutput with Bert->BigBird +class FlaxBigBirdSelfOutput(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, input_tensor, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class FlaxBigBirdAttention(nn.Module): + config: BigBirdConfig + layer_id: Optional[int] = None + causal: bool = False + dtype: jnp.dtype = jnp.float32 + + def setup(self): + if self.config.attention_type == "original_full": + self.self = FlaxBigBirdSelfAttention(self.config, causal=self.causal, dtype=self.dtype) + elif self.config.attention_type == "block_sparse": + self.self = FlaxBigBirdBlockSparseAttention(self.config, block_sparse_seed=self.layer_id, dtype=self.dtype) + else: + raise ValueError( + f"Your `config.attention_type` is {self.config.attention_type} but it can either be `original_full` or" + " `block_sparse`" + ) + + self.output = FlaxBigBirdSelfOutput(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states=None, + init_cache=False, + deterministic=True, + output_attentions: bool = False, + ): + # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length) + # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable + # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length) + if self.config.attention_type == "original_full": + attn_outputs = self.self( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=key_value_states, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + else: + attn_outputs = self.self( + hidden_states, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] + hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_outputs[1],) + + return outputs + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertIntermediate with Bert->BigBird +class FlaxBigBirdIntermediate(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.intermediate_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.activation = ACT2FN[self.config.hidden_act] + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert->BigBird +class FlaxBigBirdOutput(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states, attention_output, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states + attention_output) + return hidden_states + + +class FlaxBigBirdLayer(nn.Module): + config: BigBirdConfig + layer_id: Optional[int] = None + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.attention = FlaxBigBirdAttention( + self.config, layer_id=self.layer_id, causal=self.config.is_decoder, dtype=self.dtype + ) + self.intermediate = FlaxBigBirdIntermediate(self.config, dtype=self.dtype) + self.output = FlaxBigBirdOutput(self.config, dtype=self.dtype) + if self.config.add_cross_attention: + self.crossattention = FlaxBigBirdAttention(self.config, causal=False, dtype=self.dtype) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer.__call__ with Bert->BigBird + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + ): + # Self Attention + attention_outputs = self.attention( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = attention_outputs[0] + + # Cross-Attention Block + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=encoder_hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + + hidden_states = self.intermediate(attention_output) + hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attention_outputs[1],) + if encoder_hidden_states is not None: + outputs += (cross_attention_outputs[1],) + return outputs + + +class FlaxBigBirdLayerCollection(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + if self.gradient_checkpointing: + FlaxBigBirdCheckpointLayer = remat(FlaxBigBirdLayer, static_argnums=(5, 6, 7)) + self.layers = [ + FlaxBigBirdCheckpointLayer(self.config, layer_id=i, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + else: + self.layers = [ + FlaxBigBirdLayer(self.config, layer_id=i, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection.__call__ with Bert->BigBird + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + # Check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.shape[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for " + f" {head_mask.shape[0]}." + ) + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer( + hidden_states, + attention_mask, + head_mask[i] if head_mask is not None else None, + encoder_hidden_states, + encoder_attention_mask, + init_cache, + deterministic, + output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->BigBird +class FlaxBigBirdEncoder(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + self.layer = FlaxBigBirdLayerCollection( + self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.layer( + hidden_states, + attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPredictionHeadTransform with Bert->BigBird +class FlaxBigBirdPredictionHeadTransform(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype) + self.activation = ACT2FN[self.config.hidden_act] + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + return self.LayerNorm(hidden_states) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLMPredictionHead with Bert->BigBird, np.ndarray->jnp.ndarray +class FlaxBigBirdLMPredictionHead(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.transform = FlaxBigBirdPredictionHeadTransform(self.config, dtype=self.dtype) + self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False) + self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,)) + + def __call__(self, hidden_states, shared_embedding=None): + hidden_states = self.transform(hidden_states) + + if shared_embedding is not None: + hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + hidden_states = self.decoder(hidden_states) + + bias = jnp.asarray(self.bias, self.dtype) + hidden_states += bias + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOnlyMLMHead with Bert->BigBird +class FlaxBigBirdOnlyMLMHead(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.predictions = FlaxBigBirdLMPredictionHead(self.config, dtype=self.dtype) + + def __call__(self, hidden_states, shared_embedding=None): + hidden_states = self.predictions(hidden_states, shared_embedding=shared_embedding) + return hidden_states + + +class FlaxBigBirdPreTrainingHeads(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.predictions = FlaxBigBirdLMPredictionHead(self.config, dtype=self.dtype) + self.seq_relationship = nn.Dense(2, dtype=self.dtype) + + def __call__(self, hidden_states, pooled_output, shared_embedding=None): + prediction_scores = self.predictions(hidden_states, shared_embedding=shared_embedding) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class FlaxBigBirdPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BigBirdConfig + base_model_prefix = "bert" + module_class: nn.Module = None + + def __init__( + self, + config: BigBirdConfig, + input_shape: Optional[tuple] = None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + gradient_checkpointing: bool = False, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) + if config.attention_type == "block_sparse" and input_shape is None: + input_shape = (1, 12 * config.block_size) + elif input_shape is None: + input_shape = (1, 1) + + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing + def enable_gradient_checkpointing(self): + self._module = self.module_class( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=True, + ) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + token_type_ids = jnp.zeros_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) + attention_mask = jnp.ones_like(input_ids) + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + params_rng, dropout_rng, indices_rng = jax.random.split(rng, num=3) + rngs = {"params": params_rng, "dropout": dropout_rng, "indices": indices_rng} + + if self.config.add_cross_attention: + encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,)) + encoder_attention_mask = attention_mask + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + return_dict=False, + ) + else: + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + return_dict=False, + ) + + random_params = module_init_outputs["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length), dtype="i4") + attention_mask = jnp.ones_like(input_ids, dtype="i4") + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + params: Optional[dict] = None, + dropout_rng: Optional[jax.random.PRNGKey] = None, + indices_rng: Optional[jax.random.PRNGKey] = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + past_key_values: Optional[dict] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # init input tensors if not passed + if token_type_ids is None: + token_type_ids = jnp.zeros_like(input_ids) + + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + if head_mask is None: + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + # Handle any PRNG if needed + rngs = {} + if indices_rng is not None: + rngs["indices"] = indices_rng + + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + if self.config.add_cross_attention: + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed + # down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be + # changed by FlaxBigBirdAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + else: + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + ) + + return outputs + + +class FlaxBigBirdModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + add_pooling_layer: bool = True + gradient_checkpointing: bool = False + + def setup(self): + self.embeddings = FlaxBigBirdEmbeddings(self.config, dtype=self.dtype) + self.encoder = FlaxBigBirdEncoder( + self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.pooler = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + hidden_states = self.embeddings( + input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic + ) + outputs = self.encoder( + hidden_states, + attention_mask, + head_mask=head_mask, + deterministic=deterministic, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + + pooled = nn.tanh(self.pooler(hidden_states[:, 0, :])) if self.add_pooling_layer else None + + if not return_dict: + # if pooled is None, don't return it + if pooled is None: + return (hidden_states,) + outputs[1:] + return (hidden_states, pooled) + outputs[1:] + + return FlaxBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=hidden_states, + pooler_output=pooled, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + "The bare BigBird Model transformer outputting raw hidden-states without any specific head on top.", + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertModel with Bert->BigBird +class FlaxBigBirdModel(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdModule + + +append_call_sample_docstring(FlaxBigBirdModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForPreTrainingModule with Bert->BigBird +class FlaxBigBirdForPreTrainingModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBigBirdPreTrainingHeads(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.tie_word_embeddings: + shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + hidden_states = outputs[0] + pooled_output = outputs[1] + + prediction_scores, seq_relationship_score = self.cls( + hidden_states, pooled_output, shared_embedding=shared_embedding + ) + + if not return_dict: + return (prediction_scores, seq_relationship_score) + outputs[2:] + + return FlaxBigBirdForPreTrainingOutput( + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. + """, + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForPreTraining with Bert->BigBird +class FlaxBigBirdForPreTraining(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForPreTrainingModule + + +FLAX_BIG_BIRD_FOR_PRETRAINING_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxBigBirdForPreTraining + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") + >>> model = FlaxBigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` +""" + +overwrite_call_docstring( + FlaxBigBirdForPreTraining, + BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_BIG_BIRD_FOR_PRETRAINING_DOCSTRING, +) +append_replace_return_docstrings( + FlaxBigBirdForPreTraining, output_type=FlaxBigBirdForPreTrainingOutput, config_class=_CONFIG_FOR_DOC +) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMaskedLMModule with Bert->BigBird +class FlaxBigBirdForMaskedLMModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + add_pooling_layer=False, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBigBirdOnlyMLMHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.tie_word_embeddings: + shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + # Compute the prediction scores + logits = self.cls(hidden_states, shared_embedding=shared_embedding) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxMaskedLMOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings("""BigBird Model with a `language modeling` head on top.""", BIG_BIRD_START_DOCSTRING) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMaskedLM with Bert->BigBird +class FlaxBigBirdForMaskedLM(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForMaskedLMModule + + +append_call_sample_docstring(FlaxBigBirdForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC) + + +class FlaxBigBirdClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__(self, features, deterministic=True): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x, deterministic=deterministic) + x = self.dense(x) + x = ACT2FN[self.config.hidden_act](x) + x = self.dropout(x, deterministic=deterministic) + x = self.out_proj(x) + return x + + +class FlaxBigBirdForSequenceClassificationModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.classifier = FlaxBigBirdClassificationHead(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output, deterministic=deterministic) + + if not return_dict: + return (logits,) + outputs[2:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForSequenceClassification with Bert->BigBird +class FlaxBigBirdForSequenceClassification(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForSequenceClassificationModule + + +append_call_sample_docstring( + FlaxBigBirdForSequenceClassification, + _CHECKPOINT_FOR_DOC, + FlaxSequenceClassifierOutput, + _CONFIG_FOR_DOC, +) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMultipleChoiceModule with Bert->BigBird +class FlaxBigBirdForMultipleChoiceModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.classifier = nn.Dense(1, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + num_choices = input_ids.shape[1] + input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None + attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None + token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None + position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None + + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) + + reshaped_logits = logits.reshape(-1, num_choices) + + if not return_dict: + return (reshaped_logits,) + outputs[2:] + + return FlaxMultipleChoiceModelOutput( + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +class FlaxBigBirdForMultipleChoice(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForMultipleChoiceModule + + def __init__( + self, + config: BigBirdConfig, + input_shape: Optional[tuple] = None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + if config.attention_type == "block_sparse" and input_shape is None: + input_shape = (1, 1, 12 * config.block_size) + elif input_shape is None: + input_shape = (1, 1) + super().__init__(config, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + +overwrite_call_docstring( + FlaxBigBirdForMultipleChoice, BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") +) +append_call_sample_docstring( + FlaxBigBirdForMultipleChoice, + _CHECKPOINT_FOR_DOC, + FlaxMultipleChoiceModelOutput, + _CONFIG_FOR_DOC, +) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassificationModule with Bert->BigBird +class FlaxBigBirdForTokenClassificationModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + dtype=self.dtype, + add_pooling_layer=False, + gradient_checkpointing=self.gradient_checkpointing, + ) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(rate=classifier_dropout) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + logits = self.classifier(hidden_states) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxTokenClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassification with Bert->BigBird +class FlaxBigBirdForTokenClassification(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForTokenClassificationModule + + +append_call_sample_docstring( + FlaxBigBirdForTokenClassification, + _CHECKPOINT_FOR_DOC, + FlaxTokenClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxBigBirdForQuestionAnsweringHead(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.intermediate = FlaxBigBirdIntermediate(self.config, dtype=self.dtype) + self.output = FlaxBigBirdOutput(self.config, dtype=self.dtype) + self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__(self, encoder_output, deterministic=True): + hidden_states = self.dropout(encoder_output, deterministic=deterministic) + hidden_states = self.intermediate(hidden_states) + hidden_states = self.output(hidden_states, encoder_output) + hidden_states = self.qa_outputs(hidden_states) + return hidden_states + + +class FlaxBigBirdForQuestionAnsweringModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + add_pooling_layer: bool = False + gradient_checkpointing: bool = False + + def setup(self): + self.config.num_labels = 2 + self.bert = FlaxBigBirdModule( + self.config, + dtype=self.dtype, + add_pooling_layer=self.add_pooling_layer, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.qa_classifier = FlaxBigBirdForQuestionAnsweringHead(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + logits_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + pooled_output = outputs[1] if self.add_pooling_layer else None + logits = self.qa_classifier(hidden_states, deterministic=deterministic) + + if logits_mask is not None: + # removing question tokens from the competition + logits = logits - logits_mask * 1e6 + + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if not return_dict: + return (start_logits, end_logits) + outputs[1:] + + return FlaxBigBirdForQuestionAnsweringModelOutput( + start_logits=start_logits, + end_logits=end_logits, + pooled_output=pooled_output, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BIG_BIRD_START_DOCSTRING, +) +class FlaxBigBirdForQuestionAnswering(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForQuestionAnsweringModule + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + question_lengths=None, + params: Optional[dict] = None, + dropout_rng: Optional[jax.random.PRNGKey] = None, + indices_rng: Optional[jax.random.PRNGKey] = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + if head_mask is None: + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + if question_lengths is None and input_ids is not None: + # assuming input_ids format: context + question_lengths = jnp.argmax((input_ids == self.config.sep_token_id).astype("i4"), axis=-1) + 1 + question_lengths = jnp.expand_dims(question_lengths, axis=1) + + seqlen = input_ids.shape[1] + + logits_mask = None + if question_lengths is not None: + # setting lengths logits to `-inf` + logits_mask = self.prepare_question_mask(question_lengths, seqlen) + if token_type_ids is None: + token_type_ids = (~logits_mask).astype("i4") + logits_mask = jnp.expand_dims(logits_mask, axis=2) + logits_mask = logits_mask.at[:, 0].set(False) + + # init input tensors if not passed + if token_type_ids is None: + token_type_ids = jnp.zeros_like(input_ids) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + if indices_rng is not None: + rngs["indices"] = indices_rng + + return self.module.apply( + {"params": params or self.params}, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids, + jnp.array(position_ids, dtype="i4"), + jnp.array(head_mask, dtype="i4"), + logits_mask, + not train, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + ) + + @staticmethod + def prepare_question_mask(q_lengths, maxlen: int): + # q_lengths -> (bz, 1) + mask = jnp.arange(0, maxlen) + mask = jnp.expand_dims(mask, axis=0) < q_lengths + return mask + + +append_call_sample_docstring( + FlaxBigBirdForQuestionAnswering, + _CHECKPOINT_FOR_DOC, + FlaxBigBirdForQuestionAnsweringModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxBigBirdForCausalLMModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + add_pooling_layer=False, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBigBirdOnlyMLMHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + token_type_ids: Optional[jnp.ndarray] = None, + head_mask: Optional[jnp.ndarray] = None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.tie_word_embeddings: + shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + # Compute the prediction scores + logits = self.cls(hidden_states, shared_embedding=shared_embedding) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxCausalLMOutputWithCrossAttentions( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for + autoregressive tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForCausalLM with Bert->BigBird +class FlaxBigBirdForCausalLM(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyway. + # Thus, we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxBigBirdForCausalLM, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, +) + + +__all__ = [ + "FlaxBigBirdForCausalLM", + "FlaxBigBirdForMaskedLM", + "FlaxBigBirdForMultipleChoice", + "FlaxBigBirdForPreTraining", + "FlaxBigBirdForQuestionAnswering", + "FlaxBigBirdForSequenceClassification", + "FlaxBigBirdForTokenClassification", + "FlaxBigBirdModel", + "FlaxBigBirdPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..56680972a63e6360eaaefc023e57bd6beff71b9d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird.py @@ -0,0 +1,303 @@ +# coding=utf-8 +# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for BigBird.""" + +import os +import re +from shutil import copyfile +from typing import Any, Optional + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} + + +@requires(backends=("sentencepiece",)) +class BigBirdTokenizer(PreTrainedTokenizer): + """ + Construct a BigBird tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `""`): + The begin of sequence token. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + prefix_tokens: list[int] = [] + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sep_token="[SEP]", + mask_token="[MASK]", + cls_token="[CLS]", + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + sep_token=sep_token, + mask_token=mask_token, + cls_token=cls_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + @property + def vocab_size(self): + return self.sp_model.get_piece_size() + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def _tokenize(self, text: str) -> list[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def _decode( + self, + token_ids: list[int], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + spaces_between_special_tokens: bool = True, + **kwargs, + ) -> str: + self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) + + filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) + + # To avoid mixing byte-level and unicode for byte-level BPT + # we need to build string separately for added tokens and byte-level tokens + # cf. https://github.com/huggingface/transformers/issues/1133 + sub_texts = [] + current_sub_text = [] + for token in filtered_tokens: + if skip_special_tokens and token in self.all_special_ids: + continue + if token in self.added_tokens_encoder: + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + current_sub_text = [] + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + + # Mimic the behavior of the Rust tokenizer: + # No space before [MASK] and [SEP] + if spaces_between_special_tokens: + text = re.sub(r" (\[(MASK|SEP)\])", r"\1", " ".join(sub_texts)) + else: + text = "".join(sub_texts) + + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A Big Bird sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + +__all__ = ["BigBirdTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..6148585a40b10385098c69714f650e1f42c22b4f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird_fast.py @@ -0,0 +1,198 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Big Bird model.""" + +import os +from shutil import copyfile +from typing import Optional + +from ...tokenization_utils import AddedToken +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_big_bird import BigBirdTokenizer +else: + BigBirdTokenizer = None + +logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + + +SPIECE_UNDERLINE = "▁" + + +class BigBirdTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" BigBird tokenizer (backed by HuggingFace's *tokenizers* library). Based on + [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This + tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token + that is used for the end of sequence. The token used is the `sep_token`. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = BigBirdTokenizer + model_input_names = ["input_ids", "attention_mask"] + prefix_tokens: list[int] = [] + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sep_token="[SEP]", + mask_token="[MASK]", + cls_token="[CLS]", + **kwargs, + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + + self.vocab_file = vocab_file + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An BigBird sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of ids. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Set to True if the token list is already formatted with special tokens for the model + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0] + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["BigBirdTokenizerFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/configuration_biogpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/configuration_biogpt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c135bcf0ef11c065b4b7ca03e92060f6cadb7719 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/configuration_biogpt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/modeling_biogpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/modeling_biogpt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5410a327c09c64d10f7b6c3e01d58baeb49a83af Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/modeling_biogpt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/modular_biogpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/modular_biogpt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5db3ac279bc72cfbd90f251935f447af77ce93d8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/modular_biogpt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/tokenization_biogpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/tokenization_biogpt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65f260d957a0a457a35d1ef29d0a9740682b12e1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/tokenization_biogpt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b813295c4455316449134cd685ec05cefdba90b1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/configuration_bit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/configuration_bit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b897a61ace0764eecfc2945901556b8b1c9de93c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/configuration_bit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/image_processing_bit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/image_processing_bit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f039a45ce751e82d060b3729567e03c1738f03b5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/image_processing_bit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/image_processing_bit_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/image_processing_bit_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da70e23122d642dc6fb75a5c7718e2e8573cf395 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/image_processing_bit_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/modeling_bit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/modeling_bit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79d9eeee3ae84923418ebe3d36df5c3b089eac15 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/modeling_bit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c3428dd3800b58e8052c53cbd71adb7dce8ea451 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2025 The BitNet Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bitnet import * + from .modeling_bitnet import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/configuration_bitnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/configuration_bitnet.py new file mode 100644 index 0000000000000000000000000000000000000000..6df31b5b8200e3b30eff59a270708984a90c35ca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/configuration_bitnet.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright 2025 The BitNet Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +"""BitNet model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BitNetConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BitNetModel`]. It is used to instantiate an BitNet + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of + BitNet b1.58 2B4T [microsoft/bitnet-b1.58-2B-4T](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 128256): + Vocabulary size of the BitNet model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BitNetModel`] + hidden_size (`int`, *optional*, defaults to 2560): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 6912): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 30): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 20): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 5): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 128000): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 128001): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 500000.0): + The base period of the RoPE embeddings. + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + + ```python + >>> from transformers import BitNetModel, BitNetConfig + + >>> # Initializing a BitNet style configuration + >>> configuration = BitNetConfig() + + >>> # Initializing a model from the BitNet style configuration + >>> model = BitNetModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bitnet" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=128256, + hidden_size=2560, + intermediate_size=6912, + num_hidden_layers=30, + num_attention_heads=20, + num_key_value_heads=5, + hidden_act="relu2", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=None, + bos_token_id=128000, + eos_token_id=128001, + tie_word_embeddings=False, + rope_theta=500000.0, + attention_bias=False, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["BitNetConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/modeling_bitnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/modeling_bitnet.py new file mode 100644 index 0000000000000000000000000000000000000000..0af7c794b155fa989da445960376069885b30bf6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/modeling_bitnet.py @@ -0,0 +1,488 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/bitnet/modular_bitnet.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_bitnet.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The BitNet Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import check_model_inputs +from .configuration_bitnet import BitNetConfig + + +@use_kernel_forward_from_hub("RMSNorm") +class BitNetRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + BitNetRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class BitNetMLP(nn.Module): + def __init__(self, config: BitNetConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + self.ffn_sub_norm = BitNetRMSNorm(config.intermediate_size, eps=config.rms_norm_eps) + + def forward(self, x): + down_proj = self.down_proj(self.ffn_sub_norm(self.act_fn(self.gate_proj(x)) * self.up_proj(x))) + return down_proj + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class BitNetAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: BitNetConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.attn_sub_norm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.attn_sub_norm(attn_output) # diff with Llama + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class BitNetDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: BitNetConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = BitNetAttention(config=config, layer_idx=layer_idx) + + self.mlp = BitNetMLP(config) + self.input_layernorm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +class BitNetRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: BitNetConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +@auto_docstring +class BitNetPreTrainedModel(PreTrainedModel): + config: BitNetConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["BitNetDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": BitNetDecoderLayer, + "attentions": BitNetAttention, + } + + +@auto_docstring +class BitNetModel(BitNetPreTrainedModel): + def __init__(self, config: BitNetConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [BitNetDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = BitNetRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +@auto_docstring +class BitNetForCausalLM(BitNetPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = None + _pp_plan = None + + def __init__(self, config): + super().__init__(config) + self.model = BitNetModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, BitNetForCausalLM + + >>> model = BitNetForCausalLM.from_pretrained("microsoft/bitnet-b1.58-2B-4T") + >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T") + + >>> prompt = f'<|begin_of_text|>User: Hey, are you conscious? Can you talk to me?<|eot_id|>Assistant: ' + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=100) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "User: Hey, are you conscious? Can you talk to me?Assistant: No, I'm not conscious. I'm an artificial intelligence designed to assist with information and tasks. How can I help you today?" + ```""" + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["BitNetForCausalLM", "BitNetModel", "BitNetPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/modular_bitnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/modular_bitnet.py new file mode 100644 index 0000000000000000000000000000000000000000..92ad3a32145992eb5b8a6bbac0af97f079ef2352 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/modular_bitnet.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2025 The BitNet Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +"""PyTorch BitNet model.""" + +from typing import Callable, Optional + +import torch + +from ...cache_utils import Cache +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_outputs import CausalLMOutputWithPast +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import logging +from ...utils.deprecation import deprecate_kwarg +from ..gemma.modeling_gemma import GemmaMLP +from ..llama.modeling_llama import ( + LlamaAttention, + LlamaDecoderLayer, + LlamaForCausalLM, + LlamaModel, + LlamaRMSNorm, + apply_rotary_pos_emb, + eager_attention_forward, +) +from .configuration_bitnet import BitNetConfig + + +logger = logging.get_logger(__name__) + + +class BitNetRMSNorm(LlamaRMSNorm): + pass + + +class BitNetMLP(GemmaMLP): + def __init__(self, config: BitNetConfig): + super().__init__(config) + self.ffn_sub_norm = BitNetRMSNorm(config.intermediate_size, eps=config.rms_norm_eps) + + def forward(self, x): + down_proj = self.down_proj(self.ffn_sub_norm(self.act_fn(self.gate_proj(x)) * self.up_proj(x))) + return down_proj + + +class BitNetAttention(LlamaAttention): + def __init__(self, config: BitNetConfig, layer_idx: int): + super().__init__(config, layer_idx) + self.attn_sub_norm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.attn_sub_norm(attn_output) # diff with Llama + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class BitNetDecoderLayer(LlamaDecoderLayer): + pass + + +class BitNetModel(LlamaModel): + pass + + +class BitNetForCausalLM(LlamaForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = None + _pp_plan = None + + def forward( + self, + **super_kwargs, + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, BitNetForCausalLM + + >>> model = BitNetForCausalLM.from_pretrained("microsoft/bitnet-b1.58-2B-4T") + >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T") + + >>> prompt = f'<|begin_of_text|>User: Hey, are you conscious? Can you talk to me?<|eot_id|>Assistant: ' + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=100) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "User: Hey, are you conscious? Can you talk to me?Assistant: No, I'm not conscious. I'm an artificial intelligence designed to assist with information and tasks. How can I help you today?" + ```""" + return super().forward(**super_kwargs) + + +__all__ = [ + "BitNetForCausalLM", + "BitNetModel", + "BitNetPreTrainedModel", # noqa: F822 +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4da453659b949fedd47b36565f445d02a52ae46 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/configuration_blenderbot_small.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/configuration_blenderbot_small.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b142562ba9991f137031e7db5e633eaf70ae4ef5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/configuration_blenderbot_small.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_blenderbot_small.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_blenderbot_small.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6af91616debfa513303c655a72a94d9781bd8fc9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_blenderbot_small.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_flax_blenderbot_small.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_flax_blenderbot_small.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3c85e63984c60b4cd4e1860691051cefcfb129e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_flax_blenderbot_small.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_tf_blenderbot_small.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_tf_blenderbot_small.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eeca7219efa297ce075e5f2283776dbaade13519 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_tf_blenderbot_small.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/tokenization_blenderbot_small.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/tokenization_blenderbot_small.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1778d3be458baf23f9f45300df71644075d5f7bb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/tokenization_blenderbot_small.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/tokenization_blenderbot_small_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/tokenization_blenderbot_small_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1b57dd123cfc3132bad6042296daf8d2250598c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/tokenization_blenderbot_small_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..952de2f855a71591e950b31751af1fe6cfbae20a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/__init__.py @@ -0,0 +1,33 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_blip import * + from .image_processing_blip import * + from .image_processing_blip_fast import * + from .modeling_blip import * + from .modeling_blip_text import * + from .modeling_tf_blip import * + from .modeling_tf_blip_text import * + from .processing_blip import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/configuration_blip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/configuration_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..6e0a5590c3f769896675e801f76ecfa9234eb8be --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/configuration_blip.py @@ -0,0 +1,317 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Blip model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BlipTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BlipTextModel`]. It is used to instantiate a BLIP + text model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the `BlipText` used by the [base + architectures](https://huggingface.co/Salesforce/blip-vqa-base). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30524): + Vocabulary size of the `Blip` text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`BlipModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + encoder_hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers from the vision model. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + bos_token_id (`int`, *optional*, defaults to 30522): + The id of the `beginning-of-sequence` token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the `end-of-sequence` token. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the `padding` token. + sep_token_id (`int`, *optional*, defaults to 102): + The id of the `separator` token. + is_decoder (`bool`, *optional*, defaults to `True`): + Whether the model is used as a decoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + label_smoothing (float, *optional*): + A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no smoothing. The targets + become a mixture of the original ground truth and a uniform distribution as described in + `Rethinking the Inception Architecture for Computer Vision `__. Default: :math:`0.0`. + + Example: + + ```python + >>> from transformers import BlipTextConfig, BlipTextModel + + >>> # Initializing a BlipTextConfig with Salesforce/blip-vqa-base style configuration + >>> configuration = BlipTextConfig() + + >>> # Initializing a BlipTextModel (with random weights) from the Salesforce/blip-vqa-base style configuration + >>> model = BlipTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size=30524, + hidden_size=768, + encoder_hidden_size=768, + intermediate_size=3072, + projection_dim=768, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=512, + hidden_act="gelu", + layer_norm_eps=1e-12, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + bos_token_id=30522, + eos_token_id=2, + pad_token_id=0, + sep_token_id=102, + is_decoder=True, + use_cache=True, + label_smoothing=0.0, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.encoder_hidden_size = encoder_hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.hidden_dropout_prob = hidden_dropout_prob + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.is_decoder = is_decoder + self.use_cache = use_cache + self.label_smoothing = label_smoothing + + +class BlipVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BlipVisionModel`]. It is used to instantiate a + BLIP vision model according to the specified arguments, defining the model architecture. Instantiating a + configuration defaults will yield a similar configuration to that of the Blip-base + [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 384): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 1e-10): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + + Example: + + ```python + >>> from transformers import BlipVisionConfig, BlipVisionModel + + >>> # Initializing a BlipVisionConfig with Salesforce/blip-vqa-base style configuration + >>> configuration = BlipVisionConfig() + + >>> # Initializing a BlipVisionModel (with random weights) from the Salesforce/blip-vqa-base style configuration + >>> model = BlipVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + image_size=384, + patch_size=16, + hidden_act="gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=1e-10, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + +class BlipConfig(PretrainedConfig): + r""" + [`BlipConfig`] is the configuration class to store the configuration of a [`BlipModel`]. It is used to instantiate + a BLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating + a configuration with the defaults will yield a similar configuration to that of the BLIP-base + [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`BlipTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`BlipVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original BLIP implementation. + image_text_hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the hidden state of the image-text fusion layer. + label_smoothing (float, optional, *optional*, defaults to 0.0): + A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no smoothing. The targets + become a mixture of the original ground truth and a uniform distribution as described in + `Rethinking the Inception Architecture for Computer Vision `__. Default: :math:`0.0`. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import BlipConfig, BlipModel + + >>> # Initializing a BlipConfig with Salesforce/blip-vqa-base style configuration + >>> configuration = BlipConfig() + + >>> # Initializing a BlipPModel (with random weights) from the Salesforce/blip-vqa-base style configuration + >>> model = BlipModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a BlipConfig from a BlipTextConfig and a BlipVisionConfig + + >>> # Initializing a BLIPText and BLIPVision configuration + >>> config_text = BlipTextConfig() + >>> config_vision = BlipVisionConfig() + + >>> config = BlipConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "blip" + sub_configs = {"text_config": BlipTextConfig, "vision_config": BlipVisionConfig} + + def __init__( + self, + text_config=None, + vision_config=None, + projection_dim=512, + logit_scale_init_value=2.6592, + image_text_hidden_size=256, + label_smoothing=0.0, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `BlipTextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. Initializing the `BlipVisionConfig` with default values.") + + self.text_config = BlipTextConfig(**text_config) + self.vision_config = BlipVisionConfig(**vision_config) + + self.text_config.encoder_hidden_size = self.vision_config.hidden_size + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + self.image_text_hidden_size = image_text_hidden_size + self.label_smoothing = label_smoothing + + +__all__ = ["BlipConfig", "BlipTextConfig", "BlipVisionConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..78a152374fd0089846f4431c8b408da4c5c9ff54 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip.py @@ -0,0 +1,297 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for BLIP.""" + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class BlipImageProcessor(BaseImageProcessor): + r""" + Constructs a BLIP image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + + # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: Optional[bool] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + + return encoded_outputs + + +__all__ = ["BlipImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..2b1d14f139049162590bcaabf923a08b194aaae5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip_fast.py @@ -0,0 +1,36 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for BLIP.""" + +from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling +from ...utils import auto_docstring + + +@auto_docstring +class BlipImageProcessorFast(BaseImageProcessorFast): + # To be checked against the slow image processor + # None values left after checking can be removed + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"height": 384, "width": 384} + do_resize = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + + +__all__ = ["BlipImageProcessorFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_blip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..8a58add38af3a68befc369376f4023b461c512b4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_blip.py @@ -0,0 +1,1308 @@ +# coding=utf-8 +# Copyright 2022 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BLIP model.""" + +import warnings +from dataclasses import dataclass +from typing import Any, Optional, Union + +import torch +from torch import nn +from torch.nn.functional import normalize + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int +from ...utils.generic import check_model_inputs +from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig +from .modeling_blip_text import BlipTextLMHeadModel, BlipTextModel + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.clip.modeling_clip.contrastive_loss +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->blip +def blip_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +@auto_docstring( + custom_intro=""" + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + """ +) +class BlipForConditionalGenerationModelOutput(ModelOutput): + r""" + loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Language modeling loss from the text decoder. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*): + Prediction scores of the language modeling head of the text decoder model. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*): + The image embeddings obtained after applying the Vision Transformer model to the input image. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[tuple[torch.FloatTensor]] = None + logits: Optional[tuple[torch.FloatTensor]] = None + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + @property + def decoder_logits(self): + warnings.warn( + "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers." + " Please use the `logits` attribute to retrieve the final output instead.", + FutureWarning, + ) + return self.logits + + +@dataclass +@auto_docstring( + custom_intro=""" + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + """ +) +class BlipTextVisionModelOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss from the text decoder. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + """ + + loss: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity + scores. + """ +) +class BlipImageTextMatchingModelOutput(ModelOutput): + r""" + itm_score (`torch.FloatTensor`): + The image-text similarity scores. + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss from the text decoder. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*): + Last layer hidden-state of the vision of the vision-only branch of the model. + question_embeds (`torch.FloatTensor`): + The question embeddings obtained by the text projection layer. + """ + + itm_score: Optional[torch.FloatTensor] = None + loss: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + vision_pooler_output: Optional[torch.FloatTensor] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + question_embeds: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring +class BlipOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`]. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`]. + text_model_output (`BaseModelOutputWithPooling`): + The output of the [`BlipTextModel`]. + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`BlipVisionModel`]. + """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: Optional[torch.FloatTensor] = None + logits_per_text: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class BlipVisionEmbeddings(nn.Module): + def __init__(self, config: BlipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embedding.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embedding + + class_pos_embed = self.position_embedding[:, :1] + patch_pos_embed = self.position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + position_embedding = self.interpolate_pos_encoding(embeddings, height, width) + else: + position_embedding = self.position_embedding + embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype) + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Blip +class BlipTextEmbeddings(nn.Module): + def __init__(self, config: BlipTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + max_position_embedding = self.position_embedding.weight.shape[0] + + if seq_length > max_position_embedding: + raise ValueError( + f"Sequence length must be less than max_position_embeddings (got `sequence length`: " + f"{seq_length} and max_position_embeddings: {max_position_embedding}" + ) + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +class BlipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + mixed_qkv = ( + self.qkv(hidden_states) + .reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2] + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3) + + new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,) + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + return output, attention_probs + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Blip +class BlipMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class BlipEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: BlipConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = BlipAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = BlipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + @auto_docstring + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + **kwargs: Unpack[TransformersKwargs], + ) -> torch.FloatTensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + **kwargs, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + return hidden_states + + +@auto_docstring +class BlipPreTrainedModel(PreTrainedModel): + config: BlipConfig + base_model_prefix = "blip" + supports_gradient_checkpointing = True + _no_split_modules = ["BlipEncoderLayer", "BlipTextEmbeddings"] + _skip_keys_device_placement = ["past_key_values"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, (nn.Conv2d, nn.Embedding, nn.Linear)): + module.weight.data.normal_(mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + if isinstance(module, BlipVisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + nn.init.trunc_normal_( + module.position_embedding, + mean=0.0, + std=factor, + ) + + nn.init.trunc_normal_( + module.class_embedding, + mean=0.0, + std=factor, + ) + + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BlipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`BlipEncoderLayer`]. + + Args: + config (`BlipConfig`): + The corresponding vision configuration for the `BlipEncoder`. + """ + + def __init__(self, config: BlipConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([BlipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + @auto_docstring + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BaseModelOutput]: + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer( + hidden_states, + attention_mask=attention_mask, + **kwargs, + ) + + return BaseModelOutput(last_hidden_state=hidden_states) + + +class BlipVisionModel(BlipPreTrainedModel): + main_input_name = "pixel_values" + config: BlipVisionConfig + _can_record_outputs = { + "hidden_states": BlipEncoderLayer, + "attentions": BlipAttention, + } + + def __init__(self, config: BlipVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = BlipVisionEmbeddings(config) + self.encoder = BlipEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + self.post_init() + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BaseModelOutputWithPooling]: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + encoder_outputs: BaseModelOutput = self.encoder( + inputs_embeds=hidden_states, + **kwargs, + ) + + last_hidden_state = encoder_outputs.last_hidden_state + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + ) + + def get_input_embeddings(self): + return self.embeddings + + +@auto_docstring( + custom_intro=""" + This model is going to be deprecated in future versions. Please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase. + """ +) +class BlipModel(BlipPreTrainedModel): + config: BlipConfig + + def __init__(self, config: BlipConfig): + super().__init__(config) + + if not isinstance(config.text_config, BlipTextConfig): + raise TypeError( + "config.text_config is expected to be of type BlipTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, BlipVisionConfig): + raise TypeError( + "config.vision_config is expected to be of type BlipVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = BlipTextModel(text_config) + self.vision_model = BlipVisionModel(vision_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) + + logger.warning( + "`BlipModel` is going to be deprecated in future release, please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase." + ) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.text_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_model.set_input_embeddings(value) + + @auto_docstring + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`BlipTextModel`]. + + Examples: + + ```python + >>> from transformers import AutoProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) + ```""" + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + ) + + pooled_output = text_outputs[1] + text_features = self.text_projection(pooled_output) + + return text_features + + @auto_docstring + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`BlipVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> image_features = model.get_image_features(**inputs) + ```""" + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) + + return image_features + + @auto_docstring + def get_multimodal_features( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.FloatTensor: + r""" + Returns: + multimodal_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The multimodal embeddings + obtained by applying the image embeddings to the text encoder using the cross-attention mechanism. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> texts = ["a photo of a cat", "a photo of a dog"] + >>> inputs = processor(images=image, text=texts, padding=True, return_tensors="pt") + + >>> multimodal_features = model.get_multimodal_features(**inputs) + ```""" + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + ) + + pooled_output = text_outputs[1] # pooled_output + multimodal_features = self.text_projection(pooled_output) + + return multimodal_features + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BlipOutput]: + r""" + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + **kwargs, + ) + + image_embeds = vision_outputs.pooler_output + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs.pooler_output + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp().to(device=text_embeds.device) + image_embeds = image_embeds.to(device=text_embeds.device, dtype=text_embeds.dtype) + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = blip_loss(logits_per_text) + + return BlipOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +@auto_docstring( + custom_intro=""" + BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass + `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise, + the decoder starts generating text from the [BOS] (beginning-of-sequence) token. will start generating the caption + from the text input. If no text input is provided, the decoder will start with the [BOS] token only. + """ +) +class BlipForConditionalGeneration(BlipPreTrainedModel, GenerationMixin): + config: BlipConfig + _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"] + main_input_name = "pixel_values" + + def __init__(self, config: BlipConfig): + super().__init__(config) + + self.vision_model = BlipVisionModel(config.vision_config) + + self.text_decoder = BlipTextLMHeadModel(config.text_config) + + self.decoder_input_ids = config.text_config.bos_token_id + self.decoder_pad_token_id = config.text_config.pad_token_id + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.text_decoder.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_decoder.set_input_embeddings(value) + + @can_return_tuple + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BlipForConditionalGenerationModelOutput]: + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForConditionalGeneration + + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "A picture of" + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + + >>> outputs = model(**inputs) + ```""" + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, + ) + + image_embeds = vision_outputs.last_hidden_state + + outputs = self.text_decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + labels=labels, + reduction="mean", + **kwargs, + ) + + return BlipForConditionalGenerationModelOutput( + loss=outputs.loss, + logits=outputs.logits, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + pixel_values: torch.FloatTensor, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **generate_kwargs, + ) -> torch.LongTensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Parameters: + pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*: + Input image to be processed + input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*): + The sequence used as a prompt for the generation. + attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForConditionalGeneration + + >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + two cats sleeping on a couch + ``` + """ + + batch_size = pixel_values.shape[0] + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + if isinstance(input_ids, list): + input_ids = torch.LongTensor(input_ids) + elif input_ids is None: + input_ids = ( + torch.LongTensor([[self.decoder_input_ids, self.config.text_config.eos_token_id]]) + .repeat(batch_size, 1) + .to(image_embeds.device) + ) + + input_ids[:, 0] = self.config.text_config.bos_token_id + attention_mask = attention_mask[:, :-1] if attention_mask is not None else None + + outputs = self.text_decoder.generate( + input_ids=input_ids[:, :-1], + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + **generate_kwargs, + ) + + return outputs + + +@auto_docstring( + custom_intro=""" + BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text + decoder. The vision encoder will encode the input image, the text encoder will encode the input question together + with the encoding of the image, and the text decoder will output the answer to the question. + """ +) +class BlipForQuestionAnswering(BlipPreTrainedModel, GenerationMixin): + config: BlipConfig + _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"] + + def __init__(self, config: BlipConfig): + super().__init__(config) + + self.vision_model = BlipVisionModel(config.vision_config) + + self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False) + + self.text_decoder = BlipTextLMHeadModel(config.text_config) + + self.decoder_pad_token_id = config.text_config.pad_token_id + self.decoder_start_token_id = config.text_config.bos_token_id + + # Initialize weights and apply final processing + self.post_init() + + def set_input_embeddings(self, value): + self.text_encoder.set_input_embeddings(value) + + def get_input_embeddings(self): + # This will return shared embeddings if they are shared else specific to encoder. + return self.text_encoder.get_input_embeddings() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BlipTextVisionModelOutput]: + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForQuestionAnswering + + >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> # training + >>> text = "How many cats are in the picture?" + >>> label = "2" + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> labels = processor(text=label, return_tensors="pt").input_ids + + >>> inputs["labels"] = labels + >>> outputs = model(**inputs) + >>> loss = outputs.loss + >>> loss.backward() + + >>> # inference + >>> text = "How many cats are in the picture?" + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ```""" + if labels is None and decoder_input_ids is None: + raise ValueError( + "Either `decoder_input_ids` or `labels` should be passed when calling `forward` with" + " `BlipForQuestionAnswering`. if you are training the model make sure that `labels` is passed, if you" + " are using the model for inference make sure that `decoder_input_ids` is passed or call `generate`" + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, + ) + + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long) + + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + **kwargs, + ) + + if labels is not None and decoder_input_ids is None: + # labels are already shifted right, see: https://github.com/huggingface/transformers/pull/23153 + decoder_input_ids = labels + + question_embeds = question_embeds[0] + + answer_output = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=question_embeds, + encoder_attention_mask=attention_mask, + labels=labels, + reduction="mean", + **kwargs, + ) + + if labels is not None: + decoder_loss = answer_output.loss.mean() + else: + decoder_loss = None + + return BlipTextVisionModelOutput( + loss=decoder_loss, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **generate_kwargs, + ) -> torch.LongTensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Parameters: + input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*): + The sequence used as a prompt for the generation. + pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*: + Input image to be processed + attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for + tokens that are NOT MASKED, `0` for MASKED tokens. + **generate_kwargs: + Additional arguments passed to the *generate* function of the decoder + + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForQuestionAnswering + + >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "How many cats are in the picture?" + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ``` + """ + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + if isinstance(input_ids, list): + input_ids = torch.LongTensor(input_ids) + + question_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=False, + ) + + question_embeds = question_outputs[0] + + question_attention_mask = torch.ones( + question_embeds.size()[:-1], dtype=torch.long, device=question_embeds.device + ) + + bos_ids = torch.full( + (question_embeds.size(0), 1), fill_value=self.decoder_start_token_id, device=question_embeds.device + ) + + outputs = self.text_decoder.generate( + input_ids=bos_ids, + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + encoder_hidden_states=question_embeds, + encoder_attention_mask=question_attention_mask, + **generate_kwargs, + ) + + return outputs + + +@auto_docstring( + custom_intro=""" + BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of + image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to + the image. + """ +) +class BlipForImageTextRetrieval(BlipPreTrainedModel): + config: BlipConfig + + def __init__(self, config: BlipConfig): + super().__init__(config) + + self.vision_model = BlipVisionModel(config.vision_config) + + self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False) + + # vision projection layer + self.vision_proj = nn.Linear(config.vision_config.hidden_size, config.image_text_hidden_size) + + # text projection layer + self.text_proj = nn.Linear(config.text_config.hidden_size, config.image_text_hidden_size) + + # image text matching head + self.itm_head = nn.Linear(config.text_config.hidden_size, 2) + + self.decoder_pad_token_id = ( + config.text_config.pad_token_id + if not hasattr(config, "decoder_pad_token_id") + else config.decoder_pad_token_id + ) + self.decoder_start_token_id = ( + config.text_config.bos_token_id + if not hasattr(config, "decoder_start_token_id") + else config.decoder_start_token_id + ) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.text_encoder.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_encoder.set_input_embeddings(value) + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + use_itm_head: Optional[bool] = True, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BlipTextVisionModelOutput]: + r""" + use_itm_head (`bool`, *optional*, defaults to `True`): + Whether or not to use the image-text matching head. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForImageTextRetrieval + + >>> model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "an image of a cat" + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> outputs = model(**inputs) + ``` + """ + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, + ) + + image_embeds = vision_outputs.last_hidden_state + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long) + + if use_itm_head: + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + **kwargs, + ) + question_embeds = question_embeds.last_hidden_state + + output = self.itm_head(question_embeds[:, 0, :]) + else: + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + **kwargs, + ) + question_embeds = question_embeds.last_hidden_state + + image_feat = normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1) + text_feat = normalize(self.text_proj(question_embeds[:, 0, :]), dim=-1) + + output = image_feat @ text_feat.t() + + return BlipImageTextMatchingModelOutput( + itm_score=output, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + question_embeds=question_embeds, + ) + + +__all__ = [ + "BlipModel", + "BlipPreTrainedModel", + "BlipForConditionalGeneration", + "BlipForQuestionAnswering", + "BlipVisionModel", + "BlipTextModel", + "BlipForImageTextRetrieval", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_blip_text.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_blip_text.py new file mode 100644 index 0000000000000000000000000000000000000000..6f1f58c7533463ab9ba01f45a15ba51a294c0651 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_blip_text.py @@ -0,0 +1,968 @@ +# coding=utf-8 +# Copyright 2022 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the BSD-3-clause license (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +from typing import Optional, Union + +import torch +from torch import Tensor, device, nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import logging +from ...utils.deprecation import deprecate_kwarg +from .configuration_blip import BlipTextConfig + + +logger = logging.get_logger(__name__) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52 +class BlipTextEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + embeddings = inputs_embeds + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97 +class BlipTextSelfAttention(nn.Module): + def __init__(self, config, is_cross_attention, layer_idx=None): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.layer_idx = layer_idx + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + batch_size, seq_length, _ = hidden_states.shape + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + attention_mask = encoder_attention_mask if is_cross_attention else attention_mask + + is_updated = False + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_layer from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_layer = curr_past_key_value.layers[self.layer_idx].keys + value_layer = curr_past_key_value.layers[self.layer_idx].values + else: + key_layer = ( + self.key(current_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(current_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + if past_key_values is not None: + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_layer, value_layer = curr_past_key_value.update( + key_layer, value_layer, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BlipTextModel forward() function) + attention_scores = attention_scores + attention_mask.to(attention_scores.device) + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer, attention_probs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert -> BlipText +class BlipTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242 +class BlipTextAttention(nn.Module): + def __init__(self, config, is_cross_attention=False, layer_idx=None): + super().__init__() + self.self = BlipTextSelfAttention(config, is_cross_attention, layer_idx=layer_idx) + self.output = BlipTextSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert -> BlipText +class BlipTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert -> BlipText +class BlipTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BlipTextLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BlipTextAttention(config, layer_idx=layer_num) + self.layer_num = layer_num + if self.config.is_decoder: + self.crossattention = BlipTextAttention( + config, is_cross_attention=self.config.is_decoder, layer_idx=layer_num + ) + self.intermediate = BlipTextIntermediate(config) + self.output = BlipTextOutput(config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + self_attention_outputs = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + past_key_values=past_key_values, + cache_position=cache_position, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] + + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + return (layer_output,) + outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386 +class BlipTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BlipTextLayer(config, i) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache: + if isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + # The model acts as encoder decoder but is not an encoder decoder. So we cast all cache objects to + # `EncoderDecoderCache` type assuming that the incoming cache is from `self_attention` + elif isinstance(past_key_values, DynamicCache): + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache(config=self.config)) + elif past_key_values is None: + past_key_values = EncoderDecoderCache( + DynamicCache(config=self.config), DynamicCache(config=self.config) + ) + + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and encoder_hidden_states is not None else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values, + output_attentions, + cache_position, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + past_key_values, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->BlipText +class BlipTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->BlipText +class BlipTextPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->BlipText +class BlipTextLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BlipTextPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->BlipText +class BlipTextOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BlipTextLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548 +class BlipTextPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: BlipTextConfig + base_model_prefix = "bert" + _no_split_modules = [] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571 +class BlipTextModel(BlipTextPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an + `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BlipTextEmbeddings(config) + self.encoder = BlipTextEncoder(config) + self.pooler = BlipTextPooler(config) if add_pooling_layer else None + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + # Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, attention_mask: Tensor, input_shape: tuple[int], device: device, is_decoder: bool + ) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`tuple[int]`): + The shape of the input to the model. + device (`torch.device`): + The device of the input to the model. + + Returns: + `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + is_decoder: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`Cache`, *optional*): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + batch_size, seq_length = input_shape + device = input_ids.device + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = inputs_embeds.device + elif encoder_embeds is not None: + input_shape = encoder_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = encoder_embeds.device + else: + raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = ( + past_key_values[0][0].shape[-2] + if not isinstance(past_key_values, Cache) + else past_key_values.get_seq_length() + ) + + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length + past_key_values_length)).to(device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811 +class BlipTextLMHeadModel(BlipTextPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BlipTextModel(config, add_pooling_layer=False) + self.cls = BlipTextOnlyMLMHead(config) + self.label_smoothing = config.label_smoothing + + def get_input_embeddings(self): + return self.bert.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.bert.set_input_embeddings(new_embeddings) + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_logits: Optional[bool] = False, + is_decoder: Optional[bool] = True, + reduction: Optional[str] = "mean", + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor`, *optional*): Sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is + configured as a decoder. + encoder_attention_mask (`torch.FloatTensor`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`torch.LongTensor`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + past_key_values (`Cache`, *optional*): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + cache_position=cache_position, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device) + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing) + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + # Overwrite -- hardcoded key return (`is_decoder=True`) + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + **model_kwargs, + ) + model_inputs["is_decoder"] = True + + return model_inputs + + +__all__ = ["BlipTextModel", "BlipTextLMHeadModel", "BlipTextPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..a1a1f7928273c69de30b4ba1c190435d91dfd509 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip.py @@ -0,0 +1,1709 @@ +# coding=utf-8 +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TensorFlow BLIP model.""" + +from __future__ import annotations + +import warnings +from dataclasses import dataclass +from typing import Any + +import tensorflow as tf + +from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling +from ...modeling_tf_utils import ( + TFPreTrainedModel, + get_initializer, + get_tf_activation, + keras, + keras_serializable, + shape_list, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, stable_softmax +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig +from .modeling_tf_blip_text import BLIP_TEXT_INPUTS_DOCSTRING, TFBlipTextLMHeadModel, TFBlipTextModel + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "Salesforce/blip-vqa-base" + + +# Copied from transformers.models.clip.modeling_tf_clip.contrastive_loss +def contrastive_loss(logits: tf.Tensor) -> tf.Tensor: + return tf.math.reduce_mean( + keras.metrics.sparse_categorical_crossentropy( + y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True + ) + ) + + +# Copied from transformers.models.clip.modeling_tf_clip.clip_loss with clip->blip +def blip_loss(similarity: tf.Tensor) -> tf.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(tf.transpose(similarity)) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +class TFBlipForConditionalGenerationModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + + Args: + loss (`tf.Tensor`, *optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`): + Language modeling loss from the text decoder. + logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*): + Prediction scores of the language modeling head of the text decoder model. + image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`, *optional*): + The image embeddings obtained after applying the Vision Transformer model to the input image. + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads.` + """ + + loss: tuple[tf.Tensor] | None = None + logits: tuple[tf.Tensor] | None = None + image_embeds: tf.Tensor | None = None + last_hidden_state: tf.Tensor | None = None + hidden_states: tuple[tf.Tensor, ...] | None = None + attentions: tuple[tf.Tensor, ...] | None = None + + @property + def decoder_logits(self): + warnings.warn( + "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers." + " Please use the `logits` attribute to retrieve the final output instead.", + FutureWarning, + ) + return self.logits + + +@dataclass +class TFBlipTextVisionModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss from the text decoder. + image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + image_embeds: tf.Tensor | None = None + last_hidden_state: tf.Tensor | None = None + hidden_states: tuple[tf.Tensor, ...] | None = None + attentions: tuple[tf.Tensor, ...] | None = None + + +@dataclass +class TFBlipImageTextMatchingModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity + scores. + + Args: + itm_score (`tf.Tensor`): + The image-text similarity scores. + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss from the text decoder. + image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + vision_pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`, *optional*): + Last layer hidden-state of the vision of the vision-only branch of the model. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + question_embeds (`tf.Tensor`): + The question embeddings obtained by the text projection layer. + """ + + itm_score: tf.Tensor | None = None + loss: tf.Tensor | None = None + image_embeds: tf.Tensor | None = None + last_hidden_state: tf.Tensor | None = None + hidden_states: tuple[tf.Tensor, ...] | None = None + vision_pooler_output: tf.Tensor | None = None + attentions: tuple[tf.Tensor, ...] | None = None + question_embeds: tuple[tf.Tensor] | None = None + + +@dataclass +class TFBlipOutput(ModelOutput): + """ + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`tf.Tensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`tf.Tensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`tf.Tensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`]. + image_embeds(`tf.Tensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`BlipTextModel`]. + vision_model_output(`BaseModelOutputWithPooling`): + The output of the [`BlipVisionModel`]. + """ + + loss: tf.Tensor | None = None + logits_per_image: tf.Tensor | None = None + logits_per_text: tf.Tensor | None = None + text_embeds: tf.Tensor | None = None + image_embeds: tf.Tensor | None = None + text_model_output: TFBaseModelOutputWithPooling = None + vision_model_output: TFBaseModelOutputWithPooling = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class TFBlipVisionEmbeddings(keras.layers.Layer): + def __init__(self, config: BlipVisionConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = keras.layers.Conv2D( + filters=self.embed_dim, + kernel_size=self.patch_size, + strides=self.patch_size, + kernel_initializer=get_initializer(self.config.initializer_range), + data_format="channels_last", + name="patch_embedding", + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + def build(self, input_shape=None): + self.class_embedding = self.add_weight( + shape=(1, 1, self.embed_dim), + initializer=get_initializer(self.config.initializer_range), + trainable=True, + name="class_embedding", + ) + + self.position_embedding = self.add_weight( + shape=(1, self.num_positions, self.embed_dim), + initializer=get_initializer(self.config.initializer_range), + trainable=True, + name="position_embedding", + ) + + if self.built: + return + self.built = True + if getattr(self, "patch_embedding", None) is not None: + with tf.name_scope(self.patch_embedding.name): + self.patch_embedding.build([None, None, None, 3]) + + def call(self, pixel_values: tf.Tensor) -> tf.Tensor: + # Input is channels-first, we transpose. PyTorch transposes after the conv because PyTorch + # likes channels-first convs. + batch_size = tf.shape(pixel_values)[0] + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + patch_embeds = self.patch_embedding(pixel_values) + patch_embeds = tf.reshape(patch_embeds, (batch_size, self.num_patches, -1)) + + class_embeds = tf.broadcast_to(self.class_embedding, (batch_size, 1, self.embed_dim)) + embeddings = tf.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[:, : tf.shape(embeddings)[1], :] + return embeddings + + +# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->Blip +class TFBlipTextEmbeddings(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.embed_dim = config.hidden_size + + self.config = config + + def build(self, input_shape: tf.TensorShape = None): + with tf.name_scope("token_embedding"): + self.weight = self.add_weight( + shape=(self.config.vocab_size, self.embed_dim), + initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range), + trainable=True, + name="weight", + ) + + with tf.name_scope("position_embedding"): + self.position_embedding = self.add_weight( + shape=(self.config.max_position_embeddings, self.embed_dim), + initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range), + trainable=True, + name="embeddings", + ) + + super().build(input_shape) + + def call( + self, + input_ids: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + if input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if position_ids is None: + position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) + + position_embeds = tf.gather(params=self.position_embedding, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + final_embeddings = inputs_embeds + position_embeds + + return final_embeddings + + +class TFBlipAttention(keras.layers.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = keras.layers.Dropout(config.attention_dropout, name="dropout") + + self.qkv = keras.layers.Dense( + 3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv" + ) + + self.projection = keras.layers.Dense( + self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection" + ) + + def call( + self, + hidden_states: tf.Tensor, + head_mask: tf.Tensor | None = None, + output_attentions: bool | None = False, + training: bool | None = None, + ) -> tuple[tf.Tensor, tf.Tensor | None, tuple[tf.Tensor] | None]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = shape_list(hidden_states) + + mixed_qkv = self.qkv(hidden_states) + mixed_qkv = tf.reshape(mixed_qkv, (bsz, tgt_len, 3, self.num_heads, self.head_dim)) + mixed_qkv = tf.transpose(mixed_qkv, perm=(2, 0, 3, 1, 4)) + + query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2] + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = query_states @ tf.transpose(key_states, (0, 1, 3, 2)) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = tf.transpose(attention_probs @ value_states, perm=(0, 2, 1, 3)) + + new_context_layer_shape = shape_list(context_layer)[:-2] + [self.embed_dim] + context_layer = tf.reshape(context_layer, new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + if getattr(self, "qkv", None) is not None: + with tf.name_scope(self.qkv.name): + self.qkv.build([None, None, self.embed_dim]) + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build([None, None, self.embed_dim]) + + +class TFBlipMLP(keras.layers.Layer): + def __init__(self, config: BlipConfig, **kwargs): + super().__init__(**kwargs) + + self.activation_fn = get_tf_activation(config.hidden_act) + + in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) + fc_std = (2 * config.hidden_size) ** -0.5 + + self.fc1 = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1" + ) + self.fc2 = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2" + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.fc1(inputs=hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(inputs=hidden_states) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.config.hidden_size]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.intermediate_size]) + + +class TFBlipEncoderLayer(keras.layers.Layer): + def __init__(self, config: BlipConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.hidden_size + self.self_attn = TFBlipAttention(config, name="self_attn") + self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") + self.mlp = TFBlipMLP(config, name="mlp") + self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + output_attentions: bool | None = False, + training: bool | None = None, + ) -> tuple[tf.Tensor]: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + training=training, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "layer_norm1", None) is not None: + with tf.name_scope(self.layer_norm1.name): + self.layer_norm1.build([None, None, self.embed_dim]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "layer_norm2", None) is not None: + with tf.name_scope(self.layer_norm2.name): + self.layer_norm2.build([None, None, self.embed_dim]) + + +class TFBlipPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BlipConfig + base_model_prefix = "blip" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + +BLIP_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`BlipConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +BLIP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@keras_serializable +class TFBlipEncoder(keras.layers.Layer): + config_class = BlipConfig + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`BlipEncoderLayer`]. + + Args: + config (`BlipConfig`): + The corresponding vision configuration for the `BlipEncoder`. + """ + + def __init__(self, config: BlipConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.layers = [TFBlipEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)] + + @unpack_inputs + def call( + self, + inputs_embeds, + attention_mask: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBaseModelOutput: + r""" + Args: + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Embedded representation of the inputs. Should be float, not int tokens. + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + training=training, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFBlipVisionModel(TFBlipPreTrainedModel): + main_input_name = "pixel_values" + config_class = BlipVisionConfig + + def __init__(self, config: BlipVisionConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + self.config = config + + self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings") + self.encoder = TFBlipEncoder(config, name="encoder") + self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm") + self.embed_dim = config.hidden_size + + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFBaseModelOutputWithPooling( + last_hidden_state=output.last_hidden_state, + pooler_output=output.pooler_output, + hidden_states=hs, + attentions=attns, + ) + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=BlipVisionConfig) + def call( + self, + pixel_values: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBaseModelOutputWithPooling: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + # TF gets confused if we call the layer with inputs of different ranks, so insert a singleton dimension + pooled_output = self.post_layernorm(tf.expand_dims(pooled_output, 1)) + pooled_output = tf.squeeze(pooled_output, 1) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "post_layernorm", None) is not None: + with tf.name_scope(self.post_layernorm.name): + self.post_layernorm.build([None, None, self.embed_dim]) + + +class TFBlipMainLayer(keras.layers.Layer): + config_class = BlipConfig + + def __init__(self, config: BlipConfig, *args, **kwargs): + super().__init__(*args, **kwargs) + + if not isinstance(config.text_config, BlipTextConfig): + raise TypeError( + "config.text_config is expected to be of type BlipTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, BlipVisionConfig): + raise TypeError( + "config.vision_config is expected to be of type BlipVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = TFBlipTextModel(text_config, name="text_model") + self.vision_model = TFBlipVisionModel(vision_config, name="vision_model") + + self.visual_projection = keras.layers.Dense( + self.projection_dim, + use_bias=False, + kernel_initializer=get_initializer(config.initializer_range), + name="visual_projection", + ) + self.text_projection = keras.layers.Dense( + self.projection_dim, + use_bias=False, + kernel_initializer=get_initializer(config.initializer_range), + name="text_projection", + ) + + self.config = config + + def build(self, input_shape=None): + self.logit_scale = self.add_weight( + name="logit_scale", + shape=[], + initializer=keras.initializers.Constant(self.config.logit_scale_init_value), + trainable=True, + ) + + if self.built: + return + self.built = True + if getattr(self, "text_model", None) is not None: + with tf.name_scope(self.text_model.name): + self.text_model.build(None) + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "visual_projection", None) is not None: + with tf.name_scope(self.visual_projection.name): + self.visual_projection.build([None, None, self.vision_embed_dim]) + if getattr(self, "text_projection", None) is not None: + with tf.name_scope(self.text_projection.name): + self.text_projection.build([None, None, self.text_embed_dim]) + + @unpack_inputs + def call( + self, + input_ids: tf.Tensor | None = None, + pixel_values: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + return_loss: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBlipOutput: + # Use BLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / tf.norm(image_embeds, ord=2, axis=-1, keepdims=True) + text_embeds = text_embeds / tf.norm(text_embeds, ord=2, axis=-1, keepdims=True) + + # cosine similarity as logits + logit_scale = tf.exp(self.logit_scale) + logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale + logits_per_image = tf.transpose(logits_per_text) + + loss = None + if return_loss: + loss = blip_loss(logits_per_text) + loss = tf.reshape(loss, (1,)) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return TFBlipOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +class TFBlipModel(TFBlipPreTrainedModel): + config_class = BlipConfig + _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] + main_input_name = "input_ids" + + def __init__(self, config: BlipConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.blip = TFBlipMainLayer(config, name="blip") + + def serving_output(self, output: TFBlipOutput) -> TFBlipOutput: + return TFBlipOutput( + logits_per_image=output.logits_per_image, + logits_per_text=output.logits_per_text, + text_embeds=output.text_embeds, + image_embeds=output.image_embeds, + ) + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBlipOutput, config_class=BlipConfig) + def call( + self, + input_ids: tf.Tensor | None = None, + pixel_values: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + return_loss: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBlipOutput: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipModel + + >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = tf.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities + ```""" + outputs = self.blip( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + return_loss=return_loss, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING) + def get_text_features( + self, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + return_dict: bool | None = None, + ) -> tf.Tensor: + r""" + Returns: + text_features (`tf.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying + the projection layer to the pooled output of [`TFBlipTextModel`]. + + Examples: + + ```python + >>> from transformers import AutoProcessor, TFBlipModel + + >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf") + >>> text_features = model.get_text_features(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.blip.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + text_features = self.blip.text_projection(pooled_output) + + return text_features + + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: tf.Tensor | None = None, + return_dict: bool | None = None, + ) -> tf.Tensor: + r""" + Returns: + image_features (`tf.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying + the projection layer to the pooled output of [`TFBlipVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipModel + + >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="tf") + + >>> image_features = model.get_image_features(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.blip.vision_model(pixel_values=pixel_values, return_dict=return_dict) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.blip.visual_projection(pooled_output) + + return image_features + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "blip", None) is not None: + with tf.name_scope(self.blip.name): + self.blip.build(None) + + +@add_start_docstrings( + """ + BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass + `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise, + the decoder starts generating text from the [BOS] (beginning-of-sequence) token. will start generating the caption + from the text input. If no text input is provided, the decoder will start with the [BOS] token only. + """, + BLIP_START_DOCSTRING, +) +class TFBlipForConditionalGeneration(TFBlipPreTrainedModel): + config_class = BlipConfig + _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] + main_input_name = "pixel_values" + + def __init__(self, config: BlipConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + + self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model") + + self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder") + + self.decoder_input_ids = config.text_config.bos_token_id + self.decoder_pad_token_id = config.text_config.pad_token_id + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.vision_model.embeddings.patch_embedding + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBlipForConditionalGenerationModelOutput, config_class=BlipConfig) + def call( + self, + pixel_values: tf.Tensor, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + labels: tf.Tensor | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBlipForConditionalGenerationModelOutput: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForConditionalGeneration + + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "A picture of" + + >>> inputs = processor(images=image, text=text, return_tensors="tf") + + >>> outputs = model(**inputs) + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[0] + + outputs = self.text_decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + labels=labels, + return_dict=False, + training=training, + ) + + if not return_dict: + outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + if labels is not None: + loss = outputs[0] + logits = outputs[1] + else: + loss = None + logits = outputs[0] + + if loss is not None and loss.shape.rank == 0: + loss = tf.reshape(loss, (1,)) + + return TFBlipForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + def generate( + self, + pixel_values: tf.Tensor, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + **generate_kwargs, + ) -> tf.Tensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Parameters: + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`: + Input image to be processed + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + The sequence used as a prompt for the generation. + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForConditionalGeneration + + >>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="tf") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + two cats sleeping on a couch + ``` + """ + + batch_size = pixel_values.shape[0] + vision_outputs = self.vision_model(pixel_values=pixel_values) + + image_embeds = vision_outputs[0] + + image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int32) + + if isinstance(input_ids, list): + input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32) + elif input_ids is None: + input_ids = tf.convert_to_tensor( + [[self.decoder_input_ids, self.config.text_config.eos_token_id]], dtype=tf.int32 + ) + + input_ids = tf.tile(input_ids, (batch_size, 1)) + + # PyTorch: input_ids[:, 0] = self.config.text_config.bos_token_id + input_ids = tf.concat( + [tf.ones((batch_size, 1), dtype=tf.int32) * self.config.text_config.bos_token_id, input_ids[:, 1:]], axis=1 + ) + attention_mask = attention_mask[:, :-1] if attention_mask is not None else None + + outputs = self.text_decoder.generate( + input_ids=input_ids[:, :-1], + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + **generate_kwargs, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "text_decoder", None) is not None: + with tf.name_scope(self.text_decoder.name): + self.text_decoder.build(None) + + +@add_start_docstrings( + """ + BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text + decoder. The vision encoder will encode the input image, the text encoder will encode the input question together + with the encoding of the image, and the text decoder will output the answer to the question. + """, + BLIP_START_DOCSTRING, +) +class TFBlipForQuestionAnswering(TFBlipPreTrainedModel): + config_class = BlipConfig + _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] + + def __init__(self, config: BlipConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + + self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model") + + self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False) + + self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder") + + self.decoder_pad_token_id = config.text_config.pad_token_id + self.decoder_start_token_id = config.text_config.bos_token_id + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.vision_model.embeddings.patch_embedding + + # Adapted from transformers.models.t5.modeling_tf_t5.TFT5PreTrainedModel._shift_right + def _shift_right(self, input_ids): + decoder_start_token_id = self.decoder_start_token_id + pad_token_id = self.decoder_pad_token_id + + if decoder_start_token_id is None or pad_token_id is None: + raise ValueError("decoder_start_token_id and pad_token_id must be defined!") + + start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + start_tokens = tf.cast(start_tokens, input_ids.dtype) # Ensure compatible dtypes for concatenation + shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) + + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, + tf.cast(tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids.dtype), + shifted_input_ids, + ) + + # "Verify that `labels` has only positive values and -100" + tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=shifted_input_ids.dtype)) + + return shifted_input_ids + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBlipTextVisionModelOutput, config_class=BlipVisionConfig) + def call( + self, + input_ids: tf.Tensor, + pixel_values: tf.Tensor | None = None, + decoder_input_ids: tf.Tensor | None = None, + decoder_attention_mask: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + labels: tf.Tensor | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBlipTextVisionModelOutput: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForQuestionAnswering + + >>> model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> # training + >>> text = "How many cats are in the picture?" + >>> label = "2" + >>> inputs = processor(images=image, text=text, return_tensors="tf") + >>> labels = processor(text=label, return_tensors="tf").input_ids + + >>> inputs["labels"] = labels + >>> outputs = model(**inputs) + >>> loss = outputs.loss + + >>> # inference + >>> text = "How many cats are in the picture?" + >>> inputs = processor(images=image, text=text, return_tensors="tf") + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ```""" + if labels is None and decoder_input_ids is None: + raise ValueError( + "Either `decoder_input_ids` or `labels` should be passed when calling" + " `TFBlipForQuestionAnswering`. if you are training the model make sure that `labels` is passed, if you" + " are using the model for inference make sure that `decoder_input_ids` is passed or call `generate`" + ) + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[0] + image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int64) + + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=return_dict, + training=training, + ) + + question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state + + if labels is not None and decoder_input_ids is None: + # labels are already shifted right, see: https://github.com/huggingface/transformers/pull/23153 + decoder_input_ids = labels + + answer_output = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=question_embeds, + encoder_attention_mask=attention_mask, + labels=labels, + return_dict=return_dict, + training=training, + ) + + if labels is not None: + decoder_loss = tf.reduce_mean(answer_output.loss) if return_dict else tf.reduce_mean(answer_output[0]) + else: + decoder_loss = None + + if not return_dict: + outputs = (decoder_loss, image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return TFBlipTextVisionModelOutput( + loss=decoder_loss, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + def generate( + self, + input_ids: tf.Tensor, + pixel_values: tf.Tensor, + attention_mask: tf.Tensor | None = None, + **generate_kwargs, + ) -> tf.Tensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Parameters: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`: + Input image to be processed + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for + tokens that are NOT MASKED, `0` for MASKED tokens. + generate_kwargs (dict, *optional*): + Additional arguments passed to the `generate` function of the decoder + + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForQuestionAnswering + + >>> model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "How many cats are in the picture?" + + >>> inputs = processor(images=image, text=text, return_tensors="tf") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ``` + """ + vision_outputs = self.vision_model(pixel_values=pixel_values) + + image_embeds = vision_outputs[0] + + image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int32) + + if isinstance(input_ids, list): + input_ids = tf.Tensor(input_ids) + + question_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=False, + ) + + question_embeds = question_outputs[0] + + question_attention_mask = tf.ones(shape_list(question_embeds)[:-1], dtype=tf.int32) + + bos_ids = tf.fill( + (tf.shape(question_embeds)[0], 1), value=tf.cast(self.decoder_start_token_id, input_ids.dtype) + ) + + outputs = self.text_decoder.generate( + input_ids=bos_ids, + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + encoder_hidden_states=question_embeds, + encoder_attention_mask=question_attention_mask, + **generate_kwargs, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "text_encoder", None) is not None: + with tf.name_scope(self.text_encoder.name): + self.text_encoder.build(None) + if getattr(self, "text_decoder", None) is not None: + with tf.name_scope(self.text_decoder.name): + self.text_decoder.build(None) + + +@add_start_docstrings( + """ + BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of + image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to + the image. + """, + BLIP_START_DOCSTRING, +) +class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel): + config_class = BlipConfig + + def __init__(self, config: BlipConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + + self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model") + + self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False) + + # vision projection layer + self.vision_proj = keras.layers.Dense( + config.image_text_hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="vision_proj", + ) + + # text projection layer + self.text_proj = keras.layers.Dense( + config.image_text_hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="text_proj", + ) + + # image text matching head + self.itm_head = keras.layers.Dense( + 2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head" + ) + + self.decoder_pad_token_id = ( + config.text_config.pad_token_id + if not hasattr(config, "decoder_pad_token_id") + else config.decoder_pad_token_id + ) + self.decoder_start_token_id = ( + config.text_config.bos_token_id + if not hasattr(config, "decoder_start_token_id") + else config.decoder_start_token_id + ) + self.config = config + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.vision_model.embeddings.patch_embedding + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBlipImageTextMatchingModelOutput, config_class=BlipVisionConfig) + def call( + self, + input_ids: tf.Tensor, + pixel_values: tf.Tensor | None = None, + use_itm_head: bool | None = True, + attention_mask: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBlipImageTextMatchingModelOutput: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForImageTextRetrieval + + >>> model = TFBlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "an image of a cat" + + >>> inputs = processor(images=image, text=text, return_tensors="tf") + >>> outputs = model(**inputs) + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[0] + image_atts = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int64) + + # Matt: In PyTorch, only one path (itm/non-itm) is taken. However, in TensorFlow this can result in + # some layers not being built! To avoid this, we always call both paths, then use an if statement to select + # which output to pass to the final output. The unnecessary nodes will be pruned from the final graph, but + # not before the layers have all been built correctly. + itm_question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=return_dict, + training=training, + ) + itm_question_embeds = itm_question_embeds[0] if not return_dict else itm_question_embeds.last_hidden_state + + itm_output = self.itm_head(itm_question_embeds[:, 0, :]) + + no_itm_question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + return_dict=return_dict, + training=training, + ) + no_itm_question_embeds = ( + no_itm_question_embeds[0] if not return_dict else no_itm_question_embeds.last_hidden_state + ) + + image_feat, _ = tf.linalg.normalize(self.vision_proj(image_embeds[:, 0, :]), ord=2, axis=-1) + text_feat, _ = tf.linalg.normalize(self.text_proj(no_itm_question_embeds[:, 0, :]), ord=2, axis=-1) + + no_itm_output = tf.matmul(image_feat, text_feat, transpose_b=True) + + if use_itm_head: + output = itm_output + question_embeds = itm_question_embeds + else: + output = no_itm_output + question_embeds = no_itm_question_embeds + + if not return_dict: + outputs = (output, vision_outputs[0]) + vision_outputs[2:] + (question_embeds,) + return tuple(output for output in outputs if output is not None) + + return TFBlipImageTextMatchingModelOutput( + itm_score=output, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + question_embeds=question_embeds, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "text_encoder", None) is not None: + with tf.name_scope(self.text_encoder.name): + self.text_encoder.build(None) + if getattr(self, "vision_proj", None) is not None: + with tf.name_scope(self.vision_proj.name): + self.vision_proj.build([None, None, self.config.vision_config.hidden_size]) + if getattr(self, "text_proj", None) is not None: + with tf.name_scope(self.text_proj.name): + self.text_proj.build([None, None, self.config.text_config.hidden_size]) + if getattr(self, "itm_head", None) is not None: + with tf.name_scope(self.itm_head.name): + self.itm_head.build([None, None, self.config.text_config.hidden_size]) + + +__all__ = [ + "TFBlipModel", + "TFBlipPreTrainedModel", + "TFBlipForConditionalGeneration", + "TFBlipForQuestionAnswering", + "TFBlipVisionModel", + "TFBlipTextModel", + "TFBlipForImageTextRetrieval", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip_text.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip_text.py new file mode 100644 index 0000000000000000000000000000000000000000..7dae1126e03b12b39b17b8fdc00b6050f9681fcd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip_text.py @@ -0,0 +1,1122 @@ +# coding=utf-8 +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the BSD-3-clause license (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import annotations + +import math + +import tensorflow as tf + +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPastAndCrossAttentions, + TFBaseModelOutputWithPoolingAndCrossAttentions, + TFCausalLMOutputWithCrossAttentions, +) +from ...modeling_tf_utils import ( + TFModelInputType, + TFPreTrainedModel, + get_initializer, + get_tf_activation, + keras, + keras_serializable, + shape_list, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, invert_attention_mask, stable_softmax +from ...utils import add_start_docstrings_to_model_forward, logging +from .configuration_blip import BlipTextConfig + + +logger = logging.get_logger(__name__) + +BLIP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52 +class TFBlipTextEmbeddings(keras.layers.Layer): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.word_embeddings = keras.layers.Embedding( + config.vocab_size, + config.hidden_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="word_embeddings", + ) + self.position_embeddings = keras.layers.Embedding( + config.max_position_embeddings, + config.hidden_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="position_embeddings", + ) + + # self.LayerNorm is not snake-cased to stick with PyTorch model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + + self.position_ids = tf.expand_dims(tf.range(config.max_position_embeddings), 0) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def call(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0, training=None): + if input_ids is not None: + input_shape = tf.shape(input_ids) + else: + input_shape = tf.shape(inputs_embeds)[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = self.word_embeddings(input_ids) + + embeddings = inputs_embeds + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings, training=training) + return embeddings + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "word_embeddings", None) is not None: + with tf.name_scope(self.word_embeddings.name): + self.word_embeddings.build(None) + if getattr(self, "position_embeddings", None) is not None: + with tf.name_scope(self.position_embeddings.name): + self.position_embeddings.build(None) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97 +class TFBlipTextSelfAttention(keras.layers.Layer): + def __init__(self, config, is_cross_attention, **kwargs): + super().__init__(**kwargs) + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = keras.layers.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size + ) + self.is_cross_attention = is_cross_attention + + def transpose_for_scores(self, x): + new_x_shape = tf.concat( + [tf.shape(x)[:-1], tf.constant([self.num_attention_heads, self.attention_head_size], dtype=tf.int32)], + axis=0, + ) + x = tf.reshape(x, new_x_shape) + return tf.transpose(x, perm=(0, 2, 1, 3)) + + def call( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + training=None, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = tf.concat([past_key_value[0], key_layer], axis=2) + value_layer = tf.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = shape_list(hidden_states)[1] + position_ids_l = tf.expand_dims(tf.range(seq_length, dtype=tf.int64, device=hidden_states.device), 1) + position_ids_r = tf.expand_dims(tf.range(seq_length, dtype=tf.int64, device=hidden_states.device), 0) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = tf.cast(positional_embedding, query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = tf.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = tf.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = tf.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BlipTextModel forward() function) + attention_scores = attention_scores + tf.cast(attention_mask, attention_scores.dtype) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = attention_probs_dropped @ value_layer + + context_layer = tf.transpose(context_layer, perm=(0, 2, 1, 3)) + new_context_layer_shape = shape_list(context_layer)[:-2] + [self.all_head_size] + context_layer = tf.reshape(context_layer, new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if self.is_cross_attention: + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.encoder_hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.encoder_hidden_size]) + else: + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + + +class TFBlipTextSelfOutput(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool | None = None) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242 +class TFBlipTextAttention(keras.layers.Layer): + def __init__(self, config, is_cross_attention=False, **kwargs): + super().__init__(**kwargs) + self.self = TFBlipTextSelfAttention(config, is_cross_attention, name="self") + # "output" is a protected attribute on TF models + self.self_output = TFBlipTextSelfOutput(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + encoder_hidden_states: tf.Tensor | None = None, + encoder_attention_mask: tf.Tensor | None = None, + past_key_value: tuple[tuple[tf.Tensor]] | None = None, + output_attentions: bool | None = False, + training: bool | None = None, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + training=training, + ) + attention_output = self.self_output(self_outputs[0], hidden_states, training=training) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "self_output", None) is not None: + with tf.name_scope(self.self_output.name): + self.self_output.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText +class TFBlipTextIntermediate(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFBlipTextOutput(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +class TFBlipTextLayer(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.attention = TFBlipTextAttention(config, name="attention") + if self.config.is_decoder: + self.crossattention = TFBlipTextAttention( + config, is_cross_attention=self.config.is_decoder, name="crossattention" + ) + self.intermediate = TFBlipTextIntermediate(config, name="intermediate") + self.self_output = TFBlipTextOutput(config, name="output") + + def call( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + training=None, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + training=training, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + intermediate_output = self.intermediate(attention_output) + layer_output = self.self_output(intermediate_output, attention_output, training=training) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "self_output", None) is not None: + with tf.name_scope(self.self_output.name): + self.self_output.build(None) + if getattr(self, "crossattention", None) is not None: + with tf.name_scope(self.crossattention.name): + self.crossattention.build(None) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386 +@keras_serializable +class TFBlipTextEncoder(keras.layers.Layer): + config_class = BlipTextConfig + + def __init__(self, config, name=None, **kwargs): + super().__init__(name=name, **kwargs) + self.config = config + self.layer = [TFBlipTextLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + @unpack_inputs + def call( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + training=None, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.is_decoder else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + training=training, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText +class TFBlipTextPooler(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(inputs=first_token_tensor) + + return pooled_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText +class TFBlipTextPredictionHeadTransform(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", + ) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = get_tf_activation(config.hidden_act) + else: + self.transform_act_fn = config.hidden_act + + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +class TFBlipTextLMPredictionHead(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.transform = TFBlipTextPredictionHeadTransform(config, name="transform") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = keras.layers.Dense( + config.vocab_size, + kernel_initializer=get_initializer(config.initializer_range), + name="decoder", + use_bias=False, + ) + self.config = config + + def build(self, input_shape=None): + self.bias = self.add_weight(name="bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True) + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build([None, None, self.config.hidden_size]) + + def call(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class TFBlipTextOnlyMLMHead(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.predictions = TFBlipTextLMPredictionHead(config, name="predictions") + + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548 +class TFBlipTextPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BlipTextConfig + base_model_prefix = "bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + +# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571 +class TFBlipTextModel(TFBlipTextPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an + `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True, name=None, **kwargs): + super().__init__(config, name=name, **kwargs) + self.config = config + + self.embeddings = TFBlipTextEmbeddings(config, name="embeddings") + self.encoder = TFBlipTextEncoder(config, name="encoder") + self.pooler = TFBlipTextPooler(config, name="pooler") if add_pooling_layer else None + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + @tf.function + def get_extended_attention_mask( + self, attention_mask: tf.Tensor, input_shape: tuple[int], is_decoder: bool + ) -> tf.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`tf.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`tuple[int]`): + The shape of the input to the model. + is_decoder (`bool`): + Whether the model is used as a decoder. + + Returns: + `tf.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if not isinstance(attention_mask, tf.Tensor): + attention_mask = tf.convert_to_tensor(attention_mask) # Catches NumPy inputs that haven't been cast yet + if attention_mask.shape.rank == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.shape.rank == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = tf.range(seq_length, dtype=attention_mask.dtype) + causal_mask = tf.broadcast_to(seq_ids, (batch_size, seq_length, seq_length)) <= seq_ids[None, :, None] + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + + if shape_list(causal_mask)[1] < shape_list(attention_mask)[1]: + prefix_seq_len = tf.shape(attention_mask)[1] - tf.shape(causal_mask)[1] + causal_mask = tf.concat( + [ + tf.ones((batch_size, seq_length, prefix_seq_len), dtype=causal_mask.dtype), + causal_mask, + ], + axis=-1, + ) + extended_attention_mask = ( + tf.cast(causal_mask[:, None, :, :], attention_mask.dtype) * attention_mask[:, None, None, :] + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = tf.cast(extended_attention_mask, self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING) + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + encoder_embeds: tf.Tensor | None = None, + encoder_hidden_states: tf.Tensor | None = None, + encoder_attention_mask: tf.Tensor | None = None, + past_key_values: tuple[tuple[tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + is_decoder: bool = False, + training: bool = False, + ) -> tuple[tf.Tensor] | TFBaseModelOutputWithPoolingAndCrossAttentions: + r""" + encoder_hidden_states (`tf.Tensor`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(tf.Tensor))`, *optional*): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + batch_size, seq_length = input_shape + elif encoder_embeds is not None: + input_shape = shape_list(encoder_embeds)[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = tf.ones((batch_size, seq_length + past_key_values_length)) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: tf.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, is_decoder) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = shape_list(encoder_hidden_states[0]) + else: + encoder_batch_size, encoder_sequence_length, _ = shape_list(encoder_hidden_states) + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = tf.ones(encoder_hidden_shape) + encoder_extended_attention_mask = invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811 +class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.bert = TFBlipTextModel(config, add_pooling_layer=False, name="bert") + self.cls = TFBlipTextOnlyMLMHead(config, name="cls") + self.label_smoothing = config.label_smoothing + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING) + @unpack_inputs + def call( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + training=None, + ): + r""" + encoder_hidden_states (`tf.Tensor`, *optional*): Sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is + configured as a decoder. + encoder_attention_mask (`tf.Tensor`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`tf.Tensor`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(tf.Tensor))`, *optional*): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + training=training, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :] + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :] + shifted_prediction_scores = tf.reshape(shifted_prediction_scores, (-1, self.config.vocab_size)) + labels = labels[:, 1:] + labels = tf.reshape(labels, (-1,)) + # Keras won't give us label smoothing for sparse CE, so we de-sparsify things here + # Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway) + one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32) + loss_fct = keras.losses.CategoricalCrossentropy( + from_logits=True, label_smoothing=self.label_smoothing, reduction="none" + ) + masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32) + lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores) + lm_loss *= masked_positions + lm_loss = tf.reduce_sum(lm_loss, axis=0) / tf.math.count_nonzero(masked_positions, dtype=tf.float32) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return TFCausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states"), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask"), + "is_decoder": True, + } + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "cls", None) is not None: + with tf.name_scope(self.cls.name): + self.cls.build(None) + + +__all__ = ["TFBlipTextLMHeadModel", "TFBlipTextModel", "TFBlipTextPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/processing_blip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/processing_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..5cc4334a974cb8e47f1b94b180e1f5cf8f553513 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/processing_blip.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Blip. +""" + +from typing import Optional, Union + +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +class BlipProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": {}, + } + + +class BlipProcessor(ProcessorMixin): + r""" + Constructs a BLIP processor which wraps a BERT tokenizer and BLIP image processor into a single processor. + + [`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`BertTokenizerFast`]. See the + docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information. + + Args: + image_processor (`BlipImageProcessor`): + An instance of [`BlipImageProcessor`]. The image processor is a required input. + tokenizer (`BertTokenizerFast`): + An instance of ['BertTokenizerFast`]. The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") + tokenizer_class = ("BertTokenizer", "BertTokenizerFast") + + def __init__(self, image_processor, tokenizer, **kwargs): + tokenizer.return_token_type_ids = False + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[BlipProcessorKwargs], + ) -> BatchEncoding: + """ + This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and + [`BertTokenizerFast.__call__`] to prepare text for the model. + + Please refer to the docstring of the above two methods for more information. + Args: + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + """ + if images is None and text is None: + raise ValueError("You have to specify either images or text.") + + text_encoding = None + + # add pixel_values encoding. If we also have text_encoding, update image encoding and return it. + # else, return the text encoding. + output_kwargs = self._merge_kwargs( + BlipProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if text is not None: + text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) + if images is not None: + encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) + + if text_encoding is not None: + encoding_image_processor.update(text_encoding) + return encoding_image_processor + + return text_encoding + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + tokenizer_input_names = [name for name in tokenizer_input_names if name != "token_type_ids"] + return tokenizer_input_names + image_processor_input_names + + +__all__ = ["BlipProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..487753e8b5f9ec04ee46a482d14ab68d2e15361e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/configuration_blt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/configuration_blt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da1f18bb95f45f59b3bdf7cf2650e2a843480ca9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/configuration_blt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/modeling_blt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/modeling_blt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd7176f6a7ef98d9f4c99ae68bc0aef7ca8fbedb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/modeling_blt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/modular_blt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/modular_blt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6fa7be1ac3d188d9e1cc635afb766427265df77 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/modular_blt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebdb9f103b8ae6cd4cdcf0aca0b7114911ecb583 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/configuration_bridgetower.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/configuration_bridgetower.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9494384cbe71eee86707b5ea58d422fafc5da1b2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/configuration_bridgetower.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7dca811edcb346affef57b976ecf9b74d52f912 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef7f761d85b4bf539f6d4464d6ac25a1278da5d6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/modeling_bridgetower.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/modeling_bridgetower.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..01bc84d1cfd718d62d49cd8a208c78690542043f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/modeling_bridgetower.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/processing_bridgetower.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/processing_bridgetower.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbc4d01e2b4dcdb93b6499553e21722f794c81b5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/processing_bridgetower.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cb726942b0f16105f8a5a5f7c661485951d7ccc7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_byt5 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/__pycache__/tokenization_byt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/__pycache__/tokenization_byt5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cacbdb5774cd99a6d460705a47211970d216bbc9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/__pycache__/tokenization_byt5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/tokenization_byt5.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/tokenization_byt5.py new file mode 100644 index 0000000000000000000000000000000000000000..2a9804db1014a963fb2054083f2db2782a41016d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/tokenization_byt5.py @@ -0,0 +1,236 @@ +# coding=utf-8 +# Copyright 2021 T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for model ByT5.""" + +import warnings +from typing import Optional + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ByT5Tokenizer(PreTrainedTokenizer): + """ + Construct a ByT5 tokenizer. ByT5 simply uses raw bytes utf-8 encoding. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + extra_ids (`int`, *optional*, defaults to 125): + Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are + accessible as "" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are + indexed from the end of the vocabulary up to beginning ("" is the last token in the vocabulary + like in ByT5 preprocessing see + [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)). + additional_special_tokens (`list[str]`, *optional*): + Additional special tokens used by the tokenizer. + """ + + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + eos_token="", + unk_token="", + pad_token="", + extra_ids=125, + additional_special_tokens=None, + **kwargs, + ) -> None: + # Add extra_ids to the special token list + if extra_ids > 0 and additional_special_tokens is None: + additional_special_tokens = [f"" for i in range(extra_ids)] + elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0: + # Check that we have the right number of extra_id special tokens + extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens))) + if extra_tokens != extra_ids: + raise ValueError( + f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are" + " provided to ByT5Tokenizer. In this case the additional_special_tokens must include the" + " extra_ids tokens" + ) + + pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token + # we force left and right stripping for backward compatibility. The byt5tests depend on this. + eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token + # unk token needs to be in the vocab with correct index + self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token} + self.offset = len(self._added_tokens_decoder) + self._utf_vocab_size = 2**8 # utf is 8 bits + super().__init__( + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + extra_ids=0, + additional_special_tokens=additional_special_tokens, # TODO extra ids are not used :sweatywmile: + **kwargs, + ) + + @property + def vocab_size(self): + return self._utf_vocab_size + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)} + vocab.update(self.added_tokens_encoder) + return vocab + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + # normal case: some special tokens + if token_ids_1 is None: + return ([0] * len(token_ids_0)) + [1] + return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def _add_eos_if_not_present(self, token_ids: list[int]) -> list[int]: + """Do not add eos again if user already added it.""" + if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id: + warnings.warn( + f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated" + " eos tokens being added." + ) + return token_ids + else: + return token_ids + [self.eos_token_id] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + """ + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A sequence has the following format: + + - single sequence: `X ` + - pair of sequences: `A B ` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + token_ids_0 = self._add_eos_if_not_present(token_ids_0) + if token_ids_1 is None: + return token_ids_0 + else: + token_ids_1 = self._add_eos_if_not_present(token_ids_1) + return token_ids_0 + token_ids_1 + + def _tokenize(self, text: str) -> list[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + tokens = [chr(i) for i in text.encode("utf-8")] + return tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + + if len(token) != 1: + token_id = None + else: + token_id = ord(token) + self.offset + + return token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = chr(index - self.offset) + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + bstring = b"" + for token in tokens: + if token in self.added_tokens_decoder: + tok_string = self.added_tokens_decoder[token].encode("utf-8") + elif token in self.added_tokens_encoder: + tok_string = token.encode("utf-8") + else: + tok_string = bytes([ord(token)]) + bstring += tok_string + string = bstring.decode("utf-8", errors="ignore") + return string + + # ByT5Tokenizer has no vocab file + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + return () + + +__all__ = ["ByT5Tokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/modeling_chameleon.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/modeling_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..033b8ecd7c630208602e1ae4d38b95df7718aaed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/modeling_chameleon.py @@ -0,0 +1,1169 @@ +# coding=utf-8 +# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Chameleon model.""" + +from functools import cached_property +from typing import Callable, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...masking_utils import create_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + TransformersKwargs, + auto_docstring, + can_return_tuple, + logging, +) +from ...utils.deprecation import deprecate_kwarg +from .configuration_chameleon import ChameleonConfig, ChameleonVQVAEConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Chameleon +class ChameleonRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + ChameleonRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) +class ChameleonRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + super().__init__() + self.scaling_factor = scaling_factor + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / ( + self.base + ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + # For BC we register cos and sin cached + self.max_seq_len_cached = max_position_embeddings + + @torch.no_grad() + def forward(self, x, position_ids): + # x: [bs, num_attention_heads, seq_len, head_size] + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class ChameleonLinearScalingRotaryEmbedding(ChameleonRotaryEmbedding): + """ChameleonRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def forward(self, x, position_ids): + # difference to the original RoPE: a scaling factor is applied to the position ids + position_ids = position_ids.float() / self.scaling_factor + cos, sin = super().forward(x, position_ids) + return cos, sin + + +class ChameleonDynamicNTKScalingRotaryEmbedding(ChameleonRotaryEmbedding): + """ChameleonRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def forward(self, x, position_ids): + # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / ( + base + ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=x.device, dtype=torch.float) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: this may break with compilation + + cos, sin = super().forward(x, position_ids) + return cos, sin + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->Chameleon +class ChameleonMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + # Ignore copy + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +class ChameleonLayerNorm(nn.LayerNorm): + """ + LayerNorm but computes stats only over the last dim because Chameleon applies gamma and beta + from each shard separately to each head, instead of reducing. We can apply each head's own + gamma/beta by repeat-interleaving weights from each shard, but the stats have to be computed + in the last dimension. This module applies gamma/beta manually to fulfill this requirement. + """ + + def __init__(self, hidden_size, *args, **kwargs): + super().__init__(hidden_size, *args, **kwargs) + self.normalized_shape = (hidden_size[-1],) + + def forward(self, hidden_states): + hidden_states = F.layer_norm(hidden_states, self.normalized_shape, None, None, eps=1e-5) + hidden_states = hidden_states * self.weight + self.bias + return hidden_states + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +# Copied from transformers.models.llama.modeling_llama.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class ChameleonAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: ChameleonConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.model_parallel_size = config.model_parallel_size + self.scaling = self.head_dim**-0.5 + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) + self.q_norm = ChameleonLayerNorm((self.num_heads, self.head_dim)) + self.k_norm = ChameleonLayerNorm((self.num_key_value_heads, self.head_dim)) + self._init_rope() + + # copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Chameleon + # TODO(joao): add me back asap :) + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = ChameleonRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = ChameleonLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = ChameleonDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.reshape(-1, self.num_heads, self.head_dim) + query_states = self.q_norm(query_states) + + key_states = key_states.reshape(-1, self.num_key_value_heads, self.head_dim) + key_states = self.k_norm(key_states) + + query_states = query_states.reshape(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; position_ids needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights + + +# copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Chameleon, LLAMA->CHAMELEON +class ChameleonDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: ChameleonConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = ChameleonAttention(config=config, layer_idx=layer_idx) + + self.mlp = ChameleonMLP(config) + self.input_layernorm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_values (`Cache`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +class ChameleonSwinDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: ChameleonConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = ChameleonAttention(config=config, layer_idx=layer_idx) + + self.mlp = ChameleonMLP(config) + self.input_layernorm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings + past_key_values (`Cache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + """ + + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = self.input_layernorm(hidden_states) + hidden_states = residual + hidden_states + # Fully Connected + residual = hidden_states + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +class ChameleonVQVAEVectorQuantizer(nn.Module): + """ + A module for vector quantization using learned embedding vectors. + + This module implements the quantization process similar to te one described in + the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous + input vectors into discrete codebook vectors, which are learned during training. + Current implementation improves over previous ones by avoiding costly matrix multiplications + and allowing for post-hoc remapping of indices. + """ + + def __init__(self, config): + super().__init__() + self.num_embeddings = config.num_embeddings + self.embedding_dim = config.embed_dim + self.beta = getattr(config, "beta", 0.25) + + self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim) + + def forward(self, hidden_state: torch.Tensor): + hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous() + hidden_state_flattened = hidden_state.view(-1, self.embedding_dim) + + # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z + distances = ( + torch.sum(hidden_state_flattened**2, dim=1, keepdim=True) + + torch.sum(self.embedding.weight**2, dim=1) + - 2 * torch.einsum("bd,dn->bn", hidden_state_flattened, self.embedding.weight.transpose(0, 1)) + ) + + min_encoding_indices = torch.argmin(distances, dim=1) + hidden_state_quant = self.embedding(min_encoding_indices).view(hidden_state.shape) + + # compute loss for embedding + loss = torch.mean((hidden_state_quant.detach() - hidden_state) ** 2) + self.beta * torch.mean( + (hidden_state_quant - hidden_state.detach()) ** 2 + ) + + # preserve gradients + hidden_state_quant = hidden_state + (hidden_state_quant - hidden_state).detach() + + # reshape back to match original input shape + hidden_state_quant = hidden_state_quant.permute(0, 3, 1, 2).contiguous() + + return hidden_state_quant, loss, min_encoding_indices + + +class ChameleonVQVAEEncoderConvDownsample(nn.Module): + def __init__(self, in_channels): + super().__init__() + self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, hidden_states): + # no asymmetric padding in torch conv, must do it ourselves + hidden_states = F.pad(hidden_states, pad=(0, 1, 0, 1), mode="constant", value=0) + hidden_states = self.conv(hidden_states) + return hidden_states + + +class ChameleonVQVAEEncoderResnetBlock(nn.Module): + def __init__( + self, + config, + in_channels, + out_channels=None, + conv_shortcut=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = in_channels if out_channels is None else out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + self.norm2 = torch.nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True) + self.dropout = torch.nn.Dropout(config.dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, hidden_states): + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states *= torch.sigmoid(hidden_states) + hidden_states = self.conv1(hidden_states) + + hidden_states = self.norm2(hidden_states) + hidden_states *= torch.sigmoid(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + residual = self.conv_shortcut(residual) + else: + residual = self.nin_shortcut(residual) + + return residual + hidden_states + + +class ChameleonVQVAEEncoderAttnBlock(nn.Module): + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, hidden_states): + residual = hidden_states + hidden_states = self.norm(hidden_states) + query_states = self.q(hidden_states) + key_states = self.k(hidden_states) + value_states = self.v(hidden_states) + + # compute attention + batch_size, channels, height, width = query_states.shape + query_states = query_states.reshape(batch_size, channels, height * width).permute(0, 2, 1) + key_states = key_states.reshape(batch_size, channels, height * width) + attn_weights = torch.bmm(query_states, key_states) + attn_weights = attn_weights * (int(channels) ** (-0.5)) + attn_weights = F.softmax(attn_weights, dim=2) + + # attend to values + value_states = value_states.reshape(batch_size, channels, height * width) + attn_weights = attn_weights.permute(0, 2, 1) + attn_output = torch.bmm(value_states, attn_weights).reshape(batch_size, channels, height, width) + + attn_output = self.proj_out(attn_output) + return residual + attn_output + + +class ChameleonVQVAEEncoder(nn.Module): + def __init__(self, config): + super().__init__() + + self.num_resolutions = len(config.channel_multiplier) + self.num_res_blocks = config.num_res_blocks + base_channels = config.base_channels + resolution = config.resolution + in_channels = config.in_channels + double_latent = config.double_latent + latent_channels = config.latent_channels + channel_multiplier = config.channel_multiplier + + self.conv_in = torch.nn.Conv2d(in_channels, base_channels, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_channel_multiplier = (1,) + tuple(channel_multiplier) + self.in_channel_multiplier = in_channel_multiplier + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = base_channels * in_channel_multiplier[i_level] + block_out = base_channels * channel_multiplier[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ChameleonVQVAEEncoderResnetBlock( + config=config, + in_channels=block_in, + out_channels=block_out, + ) + ) + block_in = block_out + if ( + config.attn_resolutions is not None + and curr_res in config.attn_resolutions + and config.attn_type == "vanilla" + ): + attn.append(ChameleonVQVAEEncoderAttnBlock(block_in)) + + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = ChameleonVQVAEEncoderConvDownsample(block_in) + curr_res = curr_res // 2 + self.down.append(down) + + self.mid = nn.Module() + self.mid.block_1 = ChameleonVQVAEEncoderResnetBlock( + config=config, + in_channels=block_in, + out_channels=block_in, + ) + self.mid.attn_1 = ChameleonVQVAEEncoderAttnBlock(block_in) if config.attn_type == "vanilla" else nn.Identity() + self.mid.block_2 = ChameleonVQVAEEncoderResnetBlock( + config=config, + in_channels=block_in, + out_channels=block_in, + ) + + self.norm_out = torch.nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True) + self.conv_out = torch.nn.Conv2d( + block_in, + 2 * latent_channels if double_latent else latent_channels, + kernel_size=3, + stride=1, + padding=1, + ) + + def forward(self, pixel_values: torch.LongTensor): + # downsampling + hidden_states = [self.conv_in(pixel_values)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + hidden_state = self.down[i_level].block[i_block]( + hidden_states[-1], + ) + if len(self.down[i_level].attn) > 0: + hidden_state = self.down[i_level].attn[i_block](hidden_state) + hidden_states.append(hidden_state) + if i_level != self.num_resolutions - 1: + hidden_states.append(self.down[i_level].downsample(hidden_states[-1])) + + # middle + last_hidden_state = hidden_states[-1] + last_hidden_state = self.mid.block_1(last_hidden_state) + last_hidden_state = self.mid.attn_1(last_hidden_state) + last_hidden_state = self.mid.block_2(last_hidden_state) + + # end + last_hidden_state = self.norm_out(last_hidden_state) + last_hidden_state *= torch.sigmoid(last_hidden_state) + last_hidden_state = self.conv_out(last_hidden_state) + return last_hidden_state + + +class ChameleonImageVocabularyMapping: + """ + A class for mapping discrete image tokens from VQGAN to BPE tokens. + """ + + def __init__(self, vocab_map): + self.vocab_map = vocab_map + self.image_token_id = vocab_map.get("") + + @cached_property + def val2name(self): + return {v: k for k, v in self.vocab_map.items()} + + @cached_property + def image_tokens(self): + return sorted([val for name, val in self.vocab_map.items() if name.startswith("IMGIMG")]) + + @cached_property + def bpe2img(self): + img_tkn_chr_mapping = {chr(ord("A") + i): str(i) for i in range(10)} + + def remap(old_name: str) -> str: + return "".join(img_tkn_chr_mapping.get(c, c) for c in old_name[len("IMGIMG") : -1]) + + return {tok: int(remap(self.val2name[tok])) for tok in self.image_tokens} + + @cached_property + def img2bpe(self): + return {v: k for k, v in self.bpe2img.items()} + + @cached_property + def bpe2img_search_tensors(self): + return torch.tensor(sorted(self.bpe2img.keys())), torch.tensor(sorted(self.bpe2img.values())) + + @cached_property + def img2bpe_mapping_tensor(self): + mapping = torch.zeros(max(self.img2bpe.keys()) + 1, dtype=torch.int) + for k, v in self.img2bpe.items(): + mapping[k] = v + return mapping + + def convert_img2bpe(self, img_batch: torch.Tensor) -> torch.Tensor: + device = img_batch.device + img_tokens = self.img2bpe_mapping_tensor[img_batch.to("cpu")] + return img_tokens.to(device) + + +@auto_docstring +class ChameleonPreTrainedModel(PreTrainedModel): + config: ChameleonConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["ChameleonDecoderLayer", "ChameleonSwinDecoderLayer"] + _skip_keys_device_placement = ["past_key_values", "causal_mask"] + _supports_flash_attn = True + _supports_sdpa = True + + _can_compile_fullgraph = True + _supports_param_buffer_assignment = False + _supports_flex_attn = True + _supports_attention_backend = True + + +@auto_docstring( + custom_intro=""" + The VQ-VAE model used in Chameleon for encoding/decoding images into discrete tokens. + This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from + [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv + Taigman](https://huggingface.co/papers/2203.13131). + """ +) +class ChameleonVQVAE(ChameleonPreTrainedModel): + config: ChameleonVQVAEConfig + _no_split_modules = [ + "ChameleonVQVAEVectorQuantizer", + "ChameleonVQVAEEncoderAttnBlock", + "ChameleonVQVAEEncoderResnetBlock", + ] + + def __init__(self, config: ChameleonVQVAEConfig): + super().__init__(config) + + self.encoder = ChameleonVQVAEEncoder(config) + self.quantize = ChameleonVQVAEVectorQuantizer(config) + self.quant_conv = torch.nn.Conv2d(config.latent_channels, config.embed_dim, 1) + self.post_quant_conv = torch.nn.Conv2d(config.embed_dim, config.latent_channels, 1) + self.eval() # Chameleon's VQ model is frozen + + def encode(self, pixel_values: torch.LongTensor): + hidden_states = self.encoder(pixel_values) + hidden_states = self.quant_conv(hidden_states) + quant, emb_loss, indices = self.quantize(hidden_states) + return quant, emb_loss, indices + + +@auto_docstring +class ChameleonModel(ChameleonPreTrainedModel): + def __init__(self, config: ChameleonConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.vocabulary_mapping = ChameleonImageVocabularyMapping(config.vocabulary_map) + decoder_layer = ChameleonDecoderLayer if not self.config.swin_norm else ChameleonSwinDecoderLayer + self.layers = nn.ModuleList( + [decoder_layer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.vqmodel = ChameleonVQVAE._from_config(config.vq_config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_image_tokens(self, pixel_values: torch.FloatTensor): + """ + Tokenizes images into discrete tokens with VQGAN module. Converts + obtained image tokens into BPE tokens and wraps with "boi" and "eoi" + special tokens. + + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): + The tensors corresponding to the input images. + """ + batch_size = pixel_values.shape[0] + _, _, image_toks = self.vqmodel.encode(pixel_values) + bpe_toks = self.vocabulary_mapping.convert_img2bpe(image_toks) + bpe_toks = bpe_toks.view(batch_size, -1) + return bpe_toks + + def get_image_features(self, pixel_values: torch.FloatTensor): + """ + Tokenizes images into discrete tokens with VQGAN module and embeds + them with text embeddings layer + + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): + The tensors corresponding to the input images. + """ + image_tokens = self.get_image_tokens(pixel_values) + vision_embeddings = self.get_input_embeddings()(image_tokens) + return vision_embeddings + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.vocabulary_mapping.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.vocabulary_mapping.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] * image_features.shape[1] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if pixel_values is not None: + image_embeds = self.get_image_features(pixel_values) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_embeds) + + # torch.jit.trace() doesn't support cache objects in the output + if use_cache and past_key_values is None and not torch.jit.is_tracing(): + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns] if v is not None + ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +@auto_docstring( + custom_intro=""" + Chameleon Model with a head on top used for outputting logits for next token prediction. + """ +) +class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = ChameleonModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_image_tokens(self, pixel_values): + return self.model.get_image_tokens(pixel_values) + + def get_image_features(self, pixel_values): + return self.model.get_image_features(pixel_values) + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import ChameleonProcessor, ChameleonForConditionalGeneration + >>> import torch + >>> import requests + >>> from PIL import Image + + >>> model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", dtype=torch.bfloat16) + >>> processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") + + >>> prompt = "I used to know a lot about constellations when I was younger, but as I grew older, I forgot most of what I knew. These are the only two constellations that I really remember now.I would like for you to tell me about 3 more constellations and give me a little bit of history about the constellation." + >>> image = Image.open(requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw) + >>> image_2 = Image.open(requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw) + + >>> inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.bfloat16) + + >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False) + >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + # Disallow image tokens which does not include special begin-image and end-image tokens + image_tokens = self.model.vocabulary_mapping.image_tokens + logits[:, :, image_tokens] = torch.finfo(logits.dtype).min + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + pixel_values=None, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + pixel_values=pixel_values, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + use_cache=use_cache, + **kwargs, + ) + + if cache_position[0] != 0: + # If we're in cached decoding stage, pixel values should be `None` because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = None + + return model_inputs + + +__all__ = ["ChameleonForConditionalGeneration", "ChameleonModel", "ChameleonPreTrainedModel", "ChameleonVQVAE"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53ea73dd0c14060b758f811234b95b6c484a2269 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/configuration_clap.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/configuration_clap.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab3660bd5ead026f9b64e59553876e133243eb70 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/configuration_clap.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/feature_extraction_clap.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/feature_extraction_clap.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a1a8121c8ad755e05175cd05f6b4def439928c2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/feature_extraction_clap.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/modeling_clap.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/modeling_clap.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97b11a07240e804b7240af0df0f62bf37b507752 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/modeling_clap.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/processing_clap.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/processing_clap.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a3c756c8259fdf8d16362f72c9c65b802219c53 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/processing_clap.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9018656c0b8baaa34509b62abe2b204d26bd747d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/configuration_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/configuration_clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db643e9620cbc4380f66e0c1728b268c4971dfbe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/configuration_clip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/feature_extraction_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/feature_extraction_clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c39dc4c96b17f50052cedfe257b7987362cf937 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/feature_extraction_clip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/image_processing_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/image_processing_clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd45b775ad5c444c5ea88899ffc4fce6e3e7a7cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/image_processing_clip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/image_processing_clip_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/image_processing_clip_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bdf46cb18a085c0f48976aa8bf0a1a5cba958fa3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/image_processing_clip_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83d5ca52ec303ce8132f5c72c6fed62280d8beee Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_clip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_flax_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_flax_clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79fa5c2722fb9218861b565f61dcb2580f5441e3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_flax_clip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_tf_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_tf_clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d037fc30d4402d1a77ef8e2ef516d054255e48ef Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_tf_clip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/processing_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/processing_clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8b311ef842d560873f7a6367eeb8aaedcac583f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/processing_clip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/tokenization_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/tokenization_clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4dcc41cce811f9ffd560f4dc2439e38c79e7e869 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/tokenization_clip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/tokenization_clip_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/tokenization_clip_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae1b9d1b2e48b998018e77ddc42099ab5fd4ecb5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/tokenization_clip_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a96d64532a85a58a4d09c4d83fac10ce2db0dc1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/configuration_clipseg.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/configuration_clipseg.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3be80c07d48e13d7308a1e3701e30b4738d1ad2c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/configuration_clipseg.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/modeling_clipseg.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/modeling_clipseg.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c388077540c517bdf11bf6acb68803a55c808e8e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/modeling_clipseg.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/processing_clipseg.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/processing_clipseg.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bee811d085c8b0e0859833887b73bd14c79917f9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/processing_clipseg.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78be2bc4895f94929da48f07c8329a5e8179864d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/configuration_clvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/configuration_clvp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1461f2cec4b4f07d1de1789b8f632d09b9f867b1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/configuration_clvp.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/feature_extraction_clvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/feature_extraction_clvp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52555c678fd89d898b6fdba5bab40190d3e2b201 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/feature_extraction_clvp.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/modeling_clvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/modeling_clvp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84c80c68b2da0001190e235152724c2b8033b83d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/modeling_clvp.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/number_normalizer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/number_normalizer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49b524dd8765fb5ec4ffdebd4368c7f9b735d65e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/number_normalizer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/processing_clvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/processing_clvp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0329091cb51053fc9d57ebe3b548999160290f31 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/processing_clvp.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/tokenization_clvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/tokenization_clvp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbc118fb933c1182fe2bdc3834f42dbb2222e52a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/tokenization_clvp.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ea2d9af11150f556b2cedfb78271f174256e64b0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_codegen import * + from .modeling_codegen import * + from .tokenization_codegen import * + from .tokenization_codegen_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/configuration_codegen.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/configuration_codegen.py new file mode 100644 index 0000000000000000000000000000000000000000..6a9ab842710cdd67911305ce324fe4c68dec173b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/configuration_codegen.py @@ -0,0 +1,231 @@ +# coding=utf-8 +# Copyright 2022 Salesforce authors, The EleutherAI, and HuggingFace Teams. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CodeGen model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping +from typing import Any, Optional + +from ... import PreTrainedTokenizer, TensorType, is_torch_available +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfigWithPast, PatchingSpec +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CodeGenConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CodeGenModel`]. It is used to instantiate a + CodeGen model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the CodeGen + [Salesforce/codegen-2B-mono](https://huggingface.co/Salesforce/codegen-2B-mono) architecture. Configuration objects + inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 50400): + Vocabulary size of the CodeGen model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CodeGenModel`]. + n_positions (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_ctx (`int`, *optional*, defaults to 2048): + This attribute is used in `CodeGenModel.__init__` without any real effect. + n_embd (`int`, *optional*, defaults to 4096): + Dimensionality of the embeddings and hidden states. + n_layer (`int`, *optional*, defaults to 28): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + rotary_dim (`int`, *optional*, defaults to 64): + Number of dimensions in the embedding that Rotary Position Embedding is applied to. + n_inner (`int`, *optional*): + Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd + activation_function (`str`, *optional*, defaults to `"gelu_new"`): + Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`. + resid_pdrop (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`int`, *optional*, defaults to 0.0): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): + The epsilon to use in the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + bos_token_id (`int`, *optional*, defaults to 50256): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 50256): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the + model has a output word embedding layer. + + Example: + + ```python + >>> from transformers import CodeGenConfig, CodeGenModel + + >>> # Initializing a CodeGen 6B configuration + >>> configuration = CodeGenConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = CodeGenModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "codegen" + attribute_map = { + "max_position_embeddings": "n_positions", + "hidden_size": "n_embd", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=50400, + n_positions=2048, + n_ctx=2048, + n_embd=4096, + n_layer=28, + n_head=16, + rotary_dim=64, + n_inner=None, + activation_function="gelu_new", + resid_pdrop=0.0, + embd_pdrop=0.0, + attn_pdrop=0.0, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + tie_word_embeddings=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.rotary_dim = rotary_dim + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.use_cache = use_cache + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__( + bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs + ) + + +# Copied from transformers.models.gpt2.configuration_gpt2.GPT2OnnxConfig +class CodeGenOnnxConfig(OnnxConfigWithPast): + def __init__( + self, + config: PretrainedConfig, + task: str = "default", + patching_specs: Optional[list[PatchingSpec]] = None, + use_past: bool = False, + ): + super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past) + if not getattr(self._config, "pad_token_id", None): + # TODO: how to do that better? + self._config.pad_token_id = 0 + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}}) + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"} + else: + common_inputs["attention_mask"] = {0: "batch", 1: "sequence"} + + return common_inputs + + @property + def num_layers(self) -> int: + return self._config.n_layer + + @property + def num_attention_heads(self) -> int: + return self._config.n_head + + def generate_dummy_inputs( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + # We need to order the input in the way they appears in the forward() + ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]}) + + # Need to add the past_keys + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + + batch, seqlen = common_inputs["input_ids"].shape + # Not using the same length for past_key_values + past_key_values_length = seqlen + 2 + past_shape = ( + batch, + self.num_attention_heads, + past_key_values_length, + self._config.hidden_size // self.num_attention_heads, + ) + ordered_inputs["past_key_values"] = [ + (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers) + ] + + ordered_inputs["attention_mask"] = common_inputs["attention_mask"] + if self.use_past: + mask_dtype = ordered_inputs["attention_mask"].dtype + ordered_inputs["attention_mask"] = torch.cat( + [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1 + ) + + return ordered_inputs + + @property + def default_onnx_opset(self) -> int: + return 13 + + +__all__ = ["CodeGenConfig", "CodeGenOnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/modeling_codegen.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/modeling_codegen.py new file mode 100644 index 0000000000000000000000000000000000000000..887b400b479929c947ed2dfeadc436294033ad8a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/modeling_codegen.py @@ -0,0 +1,668 @@ +# coding=utf-8 +# Copyright 2022 Salesforce authors, The EleutherAI, and HuggingFace Teams. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch CodeGen model.""" + +from typing import Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_utils import PreTrainedModel +from ...utils import ( + auto_docstring, + is_torch_flex_attn_available, + logging, +) +from .configuration_codegen import CodeGenConfig + + +if is_torch_flex_attn_available(): + from torch.nn.attention.flex_attention import BlockMask + + from ...integrations.flex_attention import make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.gptj.modeling_gptj.create_sinusoidal_positions +def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor: + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim)) + sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq).float() + return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1) + + +# Copied from transformers.models.gptj.modeling_gptj.rotate_every_two +def rotate_every_two(x: torch.Tensor) -> torch.Tensor: + x1 = x[:, :, :, ::2] + x2 = x[:, :, :, 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) # in einsum notation: rearrange(x, '... d j -> ... (d j)') + + +# Copied from transformers.models.gptj.modeling_gptj.apply_rotary_pos_emb +def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor: + sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3) + cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3) + return (tensor * cos) + (rotate_every_two(tensor) * sin) + + +class CodeGenAttention(nn.Module): + def __init__(self, config, layer_idx=None): + super().__init__() + + max_positions = config.max_position_embeddings + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.embed_dim = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_attention_heads + if self.head_dim * self.num_attention_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and" + f" `num_attention_heads`: {self.num_attention_heads})." + ) + self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype()) + self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False) + + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.rotary_dim = config.rotary_dim + pos_embd_dim = self.rotary_dim or self.embed_dim + self.embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim) + + def _split_heads(self, x, n_head, dim_head, mp_num): + reshaped = x.reshape(x.shape[:-1] + (n_head // mp_num, dim_head)) + reshaped = reshaped.reshape(x.shape[:-2] + (-1,) + reshaped.shape[-1:]) + return reshaped + + def _merge_heads(self, tensor, num_attention_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into n_ctx + """ + if len(tensor.shape) == 5: + tensor = tensor.permute(0, 1, 3, 2, 4).contiguous() + elif len(tensor.shape) == 4: + tensor = tensor.permute(0, 2, 1, 3).contiguous() + else: + raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}") + new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,) + return tensor.view(new_shape) + + def _attn( + self, + query, + key, + value, + attention_mask=None, + head_mask=None, + ): + # Keep the attention weights computation in fp32 to avoid overflow issues + query = query.to(torch.float32) + key = key.to(torch.float32) + + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_weights += causal_mask + + attn_weights = attn_weights / self.scale_attn + attn_weights = nn.Softmax(dim=-1)(attn_weights) + attn_weights = attn_weights.to(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def forward( + self, + hidden_states: Optional[torch.FloatTensor], + layer_past: Optional[Cache] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[ + tuple[torch.Tensor, tuple[torch.Tensor]], + Optional[tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]]], + ]: + qkv = self.qkv_proj(hidden_states) + # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic + mp_num = 4 + qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1)) + + local_dim = self.head_dim * self.num_attention_heads // mp_num + query, value, key = torch.split(qkv_split, local_dim, dim=-1) + query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num) + key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num) + + value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num) + value = value.permute(0, 2, 1, 3) + + embed_positions = self.embed_positions + if embed_positions.device != position_ids.device: + embed_positions = embed_positions.to(position_ids.device) + self.embed_positions = embed_positions + + sincos = embed_positions[position_ids] + sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1) + + if self.rotary_dim is not None: + k_rot = key[:, :, :, : self.rotary_dim] + k_pass = key[:, :, :, self.rotary_dim :] + + q_rot = query[:, :, :, : self.rotary_dim] + q_pass = query[:, :, :, self.rotary_dim :] + + k_rot = apply_rotary_pos_emb(k_rot, sin, cos) + q_rot = apply_rotary_pos_emb(q_rot, sin, cos) + + key = torch.cat([k_rot, k_pass], dim=-1) + query = torch.cat([q_rot, q_pass], dim=-1) + else: + key = apply_rotary_pos_emb(key, sin, cos) + query = apply_rotary_pos_emb(query, sin, cos) + + key = key.permute(0, 2, 1, 3) + query = query.permute(0, 2, 1, 3) + + # Note that this cast is quite ugly, but is not implemented before ROPE as k_rot in the original codebase is always in fp32. + # Reference: https://github.com/salesforce/CodeGen/blob/f210c3bb1216c975ad858cd4132c0fdeabf4bfc2/codegen1/jaxformer/hf/codegen/modeling_codegen.py#L38 + if layer_past is not None: + cache_kwargs = { + "sin": sin, + "cos": cos, + "partial_rotation_size": self.rotary_dim, + "cache_position": cache_position, + } + key, value = layer_past.update(key.to(hidden_states.dtype), value, self.layer_idx, cache_kwargs) + + # compute self-attention: V x Softmax(QK^T) + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim) + attn_output = self.out_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + return attn_output, attn_weights + + +# Copied from transformers.models.gptj.modeling_gptj.GPTJMLP with GPTJ->CodeGen +class CodeGenMLP(nn.Module): + def __init__(self, intermediate_size, config): # in MLP: intermediate_size= 4 * embed_dim + super().__init__() + embed_dim = config.n_embd + + self.fc_in = nn.Linear(embed_dim, intermediate_size) + self.fc_out = nn.Linear(intermediate_size, embed_dim) + + self.act = ACT2FN[config.activation_function] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor: + hidden_states = self.fc_in(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.fc_out(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.gptj.modeling_gptj.GPTJBlock with GPTJ->CodeGen +class CodeGenBlock(GradientCheckpointingLayer): + # Ignore copy + def __init__(self, config, layer_idx=None): + super().__init__() + inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd + self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) + self.attn = CodeGenAttention(config, layer_idx) + self.mlp = CodeGenMLP(inner_dim, config) + + def forward( + self, + hidden_states: Optional[torch.FloatTensor], + layer_past: Optional[Cache] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_outputs, attn_weights = self.attn( + hidden_states=hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + feed_forward_hidden_states = self.mlp(hidden_states) + hidden_states = attn_outputs + feed_forward_hidden_states + residual + + return hidden_states, attn_weights + + +@auto_docstring +class CodeGenPreTrainedModel(PreTrainedModel): + config: CodeGenConfig + base_model_prefix = "transformer" + supports_gradient_checkpointing = True + _no_split_modules = ["CodeGenBlock"] + _skip_keys_device_placement = "past_key_values" + + _can_compile_fullgraph = True + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear,)): + # Slightly different from Mesh Transformer JAX which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring +class CodeGenModel(CodeGenPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embed_dim = config.n_embd + self.vocab_size = config.vocab_size + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([CodeGenBlock(config, layer_idx=i) for i in range(config.n_layer)]) + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads) + + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, # NOOP kwargs, for now + ) -> Union[tuple, BaseModelOutputWithPast]: + r""" + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + seq_length = inputs_embeds.shape[1] + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x num_attention_heads x N x N + # head_mask has shape n_layer x batch x num_attention_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + hidden_states = inputs_embeds + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, seq_length) + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + output_shape = (-1, seq_length, hidden_states.size(-1)) + + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + for i, block in enumerate(self.h): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block( + hidden_states, + layer_past=past_key_values, + attention_mask=causal_mask, + position_ids=position_ids, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[1],) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None + ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: Union[torch.Tensor, "BlockMask"], + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool = False, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and (attention_mask == 0.0).any(): + return attention_mask + return None + if self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask) + return attention_mask + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype = input_tensor.dtype + sequence_length = input_tensor.shape[1] + if using_compilable_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +@auto_docstring( + custom_intro=""" + The CodeGen Model transformer with a language modeling head on top. + """ +) +class CodeGenForCausalLM(CodeGenPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.transformer = CodeGenModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[tuple, CausalLMOutputWithPast]: + r""" + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + hidden_states = transformer_outputs[0] + + # make sure sampling in fp16 works correctly and + # compute loss in fp32 to match with mesh-tf version + # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179 + lm_logits = self.lm_head(hidden_states).to(torch.float32) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # Flatten the tokens + loss = self.loss_function( + lm_logits, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + loss = loss.to(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = ["CodeGenForCausalLM", "CodeGenModel", "CodeGenPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/tokenization_codegen.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/tokenization_codegen.py new file mode 100644 index 0000000000000000000000000000000000000000..152b1a84fc37d5ed6613072284133565ebc86cbf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/tokenization_codegen.py @@ -0,0 +1,390 @@ +# coding=utf-8 +# Copyright 2022 The Salesforce authors, The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for CodeGen""" + +import json +import os +from functools import lru_cache +from typing import TYPE_CHECKING, Optional, Union + +import numpy as np +import regex as re + +from ...utils import is_tf_available, is_torch_available, logging, to_py_obj + + +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + + +@lru_cache +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class CodeGenTokenizer(PreTrainedTokenizer): + """ + Construct a CodeGen tokenizer. Based on byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import CodeGenTokenizer + + >>> tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono") + >>> tokenizer("Hello world")["input_ids"] + [15496, 995] + + >>> tokenizer(" Hello world")["input_ids"] + [18435, 995] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one). + + + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The end of sequence token. + pad_token (`str`, *optional*): + The token used for padding, for example when batching sequences of different lengths. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (CodeGen tokenizer detect beginning of words by the preceding space). + add_bos_token (`bool`, *optional*, defaults to `False`): + Whether to add a beginning of sequence token at the start of sequences. + return_token_type_ids (`bool`, *optional*, defaults to `False`): + Whether to return token type IDs. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + pad_token=None, + add_prefix_space=False, + add_bos_token=False, + return_token_type_ids=False, + **kwargs, + ): + bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token + self.add_bos_token = add_bos_token + self.return_token_type_ids = return_token_type_ids + if self.return_token_type_ids: + self.model_input_names.append("token_type_ids") + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + + # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + super().__init__( + errors=errors, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + add_prefix_space=add_prefix_space, + add_bos_token=add_bos_token, + return_token_type_ids=return_token_type_ids, + **kwargs, + ) + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is None: + return output + + return output + bos_token_ids + token_ids_1 + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if is_split_into_words or add_prefix_space: + text = " " + text + return (text, kwargs) + + def decode( + self, + token_ids: Union[int, list[int], "np.ndarray", "torch.Tensor", "tf.Tensor"], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + truncate_before_pattern: Optional[list[str]] = None, + **kwargs, + ) -> str: + """ + Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special + tokens and clean up tokenization spaces. + + Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. If `None`, will default to + `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`). + truncate_before_pattern (`List[str]`, *optional*, defaults to `None`): + A list of regular expression strings that will be used to truncate the returned string. This can be + used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning + of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + + Returns: + `str`: The decoded sentence. + """ + + token_ids = to_py_obj(token_ids) + + decoded_text = super()._decode( + token_ids=token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + if truncate_before_pattern is not None and len(truncate_before_pattern) > 0: + decoded_text = self.truncate(decoded_text, truncate_before_pattern) + + return decoded_text + + def truncate(self, completion, truncate_before_pattern): + def find_re(string, pattern, start_pos): + m = pattern.search(string, start_pos) + return m.start() if m else -1 + + terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern] + + prints = list(re.finditer("^print", completion, re.MULTILINE)) + + if len(prints) > 1: + completion = completion[: prints[1].start()] + + defs = list(re.finditer("^def", completion, re.MULTILINE)) + + if len(defs) > 1: + completion = completion[: defs[1].start()] + + start_pos = 0 + + terminals_pos = [ + pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1 + ] + + if len(terminals_pos) > 0: + return completion[: min(terminals_pos)] + else: + return completion + + +__all__ = ["CodeGenTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/tokenization_codegen_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/tokenization_codegen_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..7bac0db7de4e7c548ddea0eb2b3c498919c6196a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/tokenization_codegen_fast.py @@ -0,0 +1,235 @@ +# coding=utf-8 +# Copyright 2022 The Salesforce authors, The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" + +import re +from typing import TYPE_CHECKING, Optional, Union + +import numpy as np + +from ...utils import is_tf_available, is_torch_available, logging + + +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + + +from ...tokenization_utils_base import BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from .tokenization_codegen import CodeGenTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + + +class CodeGenTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" CodeGen tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level + Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import CodeGenTokenizerFast + + >>> tokenizer = CodeGenTokenizerFast.from_pretrained("Salesforce/codegen-350M-mono") + >>> tokenizer("Hello world")["input_ids"] + [15496, 995] + + >>> tokenizer(" Hello world")["input_ids"] + [18435, 995] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since + the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. + + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`, *optional*): + Path to the vocabulary file. + merges_file (`str`, *optional*): + Path to the merges file. + tokenizer_file (`str`, *optional*): + Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that + contains everything needed to load the tokenizer. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The end of sequence token. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (CodeGen tokenizer detect beginning of words by the preceding space). + return_token_type_ids (`bool`, *optional*, defaults to `False`): + Whether to return token type IDs. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = CodeGenTokenizer + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + add_prefix_space=False, + return_token_type_ids=False, + **kwargs, + ): + self.return_token_type_ids = return_token_type_ids + if self.return_token_type_ids: + self.model_input_names.append("token_type_ids") + + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + add_prefix_space=add_prefix_space, + return_token_type_ids=return_token_type_ids, + **kwargs, + ) + + if kwargs.pop("add_bos_token", False): + model_id = kwargs.pop("name_or_path", "") + raise ValueError( + "Currently GPT2's fast tokenizer does NOT support adding a BOS token. " + "Instead you should use GPT2's slow tokenizer class `CodeGenTokenizer` as follows: \n" + f"`CodeGenTokenizer.from_pretrained('{model_id}')`\nor\n" + f"`AutoTokenizer.from_pretrained('{model_id}', use_fast=False)`\n" + "This issue will be fixed soon, see: https://github.com/huggingface/tokenizers/pull/1005." + " so that the fast tokenizer works correctly." + ) + + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + assert self.add_prefix_space or not is_split_into_words, ( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._batch_encode_plus(*args, **kwargs) + + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + assert self.add_prefix_space or not is_split_into_words, ( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + def decode( + self, + token_ids: Union[int, list[int], "np.ndarray", "torch.Tensor", "tf.Tensor"], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + truncate_before_pattern: Optional[list[str]] = None, + **kwargs, + ) -> str: + """ + Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special + tokens and clean up tokenization spaces. + + Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. If `None`, will default to + `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`). + truncate_before_pattern (`List[str]`, *optional*, defaults to `None`): + A list of regular expression strings that will be used to truncate the returned string. This can be + used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning + of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + + Returns: + `str`: The decoded sentence. + """ + + decoded_text = super().decode( + token_ids=token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + if truncate_before_pattern is not None and len(truncate_before_pattern) > 0: + decoded_text = self.truncate(decoded_text, truncate_before_pattern) + + return decoded_text + + def truncate(self, completion, truncate_before_pattern): + def find_re(string, pattern, start_pos): + m = pattern.search(string, start_pos) + return m.start() if m else -1 + + terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern] + + prints = list(re.finditer("^print", completion, re.MULTILINE)) + + if len(prints) > 1: + completion = completion[: prints[1].start()] + + defs = list(re.finditer("^def", completion, re.MULTILINE)) + + if len(defs) > 1: + completion = completion[: defs[1].start()] + + start_pos = 0 + + terminals_pos = [ + pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1 + ] + + if len(terminals_pos) > 0: + return completion[: min(terminals_pos)] + else: + return completion + + +__all__ = ["CodeGenTokenizerFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f68cfedd1c15ebd4b3f19fb1b461f39edc144523 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/modeling_cohere2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/modeling_cohere2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31a61473ffdb1841669126ccc01a88c0e7328afd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/modeling_cohere2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/modular_cohere2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/modular_cohere2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4016c539f7c6cf5556e923e20b5108931f1898dd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/modular_cohere2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89bbcf782e83f833c2cff63d7b31135ae0c3cd5b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/configuration_convnextv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/configuration_convnextv2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1b2f2beee45764600d85c28ba7de1e79a0c9658 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/configuration_convnextv2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/modeling_convnextv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/modeling_convnextv2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..468e500e3a573ba2e1d94534c1255eda540ccf18 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/modeling_convnextv2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/modeling_tf_convnextv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/modeling_tf_convnextv2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23c7cef0bf690248567453eb67e86b9c99a4ace8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/modeling_tf_convnextv2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31fc0d8ebef4685bcba991f44a465f0106e08637 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/configuration_csm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/configuration_csm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f4b6b062611bff4756a5a70288215257ae06ea9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/configuration_csm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/generation_csm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/generation_csm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21da0752c3d8ab55eaeedc3988247190561c4a4d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/generation_csm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/modeling_csm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/modeling_csm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a187bc3d44b0ff684a52fca5c66d1f42a8e4142b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/modeling_csm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/modular_csm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/modular_csm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f3204d44aa3c10ddd0a15130ecfea23814f945f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/modular_csm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/processing_csm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/processing_csm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d025117a0b314521ca8cf3500f757718873e1936 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/processing_csm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ea62163babef934550a87c17577891614b071642 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_ctrl import * + from .modeling_ctrl import * + from .modeling_tf_ctrl import * + from .tokenization_ctrl import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/configuration_ctrl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/configuration_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..7a812f0b55650fab8ab9a4d1b4c6bad4cd9446f4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/configuration_ctrl.py @@ -0,0 +1,116 @@ +# coding=utf-8 +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Salesforce CTRL configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CTRLConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`CTRLModel`] or a [`TFCTRLModel`]. It is used to + instantiate a CTRL model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the + [Salesforce/ctrl](https://huggingface.co/Salesforce/ctrl) architecture from SalesForce. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 246534): + Vocabulary size of the CTRL model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CTRLModel`] or [`TFCTRLModel`]. + n_positions (`int`, *optional*, defaults to 256): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_embd (`int`, *optional*, defaults to 1280): + Dimensionality of the embeddings and hidden states. + dff (`int`, *optional*, defaults to 8192): + Dimensionality of the inner dimension of the feed forward networks (FFN). + n_layer (`int`, *optional*, defaults to 48): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`int`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-06): + The epsilon to use in the layer normalization layers + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + + + Examples: + + ```python + >>> from transformers import CTRLConfig, CTRLModel + + >>> # Initializing a CTRL configuration + >>> configuration = CTRLConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = CTRLModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "ctrl" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "max_position_embeddings": "n_positions", + "hidden_size": "n_embd", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=246534, + n_positions=256, + n_embd=1280, + dff=8192, + n_layer=48, + n_head=16, + resid_pdrop=0.1, + embd_pdrop=0.1, + layer_norm_epsilon=1e-6, + initializer_range=0.02, + use_cache=True, + **kwargs, + ): + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.dff = dff + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + + self.use_cache = use_cache + + super().__init__(**kwargs) + + +__all__ = ["CTRLConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_ctrl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..506bed039b17d4dadfce2755366207f3a2e718f3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_ctrl.py @@ -0,0 +1,787 @@ +# coding=utf-8 +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch CTRL model.""" + +from typing import Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + auto_docstring, + logging, +) +from .configuration_ctrl import CTRLConfig + + +logger = logging.get_logger(__name__) + + +def angle_defn(pos, i, d_model_size): + angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size) + return pos * angle_rates + + +def positional_encoding(position, d_model_size, dtype): + # create the sinusoidal pattern for the positional encoding + angle_rads = angle_defn( + torch.arange(position, dtype=torch.int64).to(dtype).unsqueeze(1), + torch.arange(d_model_size, dtype=torch.int64).to(dtype).unsqueeze(0), + d_model_size, + ) + + sines = torch.sin(angle_rads[:, 0::2]) + cosines = torch.cos(angle_rads[:, 1::2]) + + pos_encoding = torch.cat([sines, cosines], dim=-1) + return pos_encoding + + +def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): + # calculate attention + matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2)) + + dk = k.shape[-1] + scaled_attention_logits = matmul_qk / np.sqrt(dk) + + if mask is not None: + nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1) + scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4 + + if attention_mask is not None: + # Apply the attention mask + scaled_attention_logits = scaled_attention_logits + attention_mask + + attention_weights = torch.softmax(scaled_attention_logits, dim=-1) + + # Mask heads if we want to + if head_mask is not None: + attention_weights = attention_weights * head_mask + + output = torch.matmul(attention_weights, v) + + return output, attention_weights + + +class MultiHeadAttention(nn.Module): + def __init__(self, d_model_size, num_heads, layer_idx=None): + super().__init__() + self.num_heads = num_heads + self.d_model_size = d_model_size + self.layer_idx = layer_idx + + self.depth = int(d_model_size / self.num_heads) + + self.Wq = nn.Linear(d_model_size, d_model_size) + self.Wk = nn.Linear(d_model_size, d_model_size) + self.Wv = nn.Linear(d_model_size, d_model_size) + + self.dense = nn.Linear(d_model_size, d_model_size) + self.pruned_heads = set() + + def prune_heads(self, heads): + attention_head_size = self.d_model_size // self.num_heads + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, attention_head_size, self.pruned_heads) + + # Prune linear layers + self.Wq = prune_linear_layer(self.Wq, index) + self.Wk = prune_linear_layer(self.Wk, index) + self.Wv = prune_linear_layer(self.Wv, index) + self.dense = prune_linear_layer(self.dense, index, dim=1) + + # Update hyper params + self.num_heads = self.num_heads - len(heads) + self.d_model_size = attention_head_size * self.num_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def split_into_heads(self, x, batch_size): + x = x.reshape(batch_size, -1, self.num_heads, self.depth) + return x.permute([0, 2, 1, 3]) + + def forward( + self, + v, + k, + q, + mask, + layer_past=None, + attention_mask=None, + head_mask=None, + use_cache=False, + output_attentions=False, + cache_position=None, + ): + batch_size = q.shape[0] + + q = self.Wq(q) + k = self.Wk(k) + v = self.Wv(v) + + q = self.split_into_heads(q, batch_size) + k = self.split_into_heads(k, batch_size) + v = self.split_into_heads(v, batch_size) + + if layer_past is not None: + k, v = layer_past.update(k, v, self.layer_idx, {"cache_position": cache_position}) + + output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask) + scaled_attention = output[0].permute([0, 2, 1, 3]) + attn = output[1] + original_size_attention = scaled_attention.reshape(batch_size, -1, self.d_model_size) + output = self.dense(original_size_attention) + return output, attn + + +def point_wise_feed_forward_network(d_model_size, dff): + return nn.Sequential(nn.Linear(d_model_size, dff), nn.ReLU(), nn.Linear(dff, d_model_size)) + + +class EncoderLayer(nn.Module): + def __init__(self, d_model_size, num_heads, dff, rate=0.1, layer_idx=None): + super().__init__() + + self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads, layer_idx=layer_idx) + self.ffn = point_wise_feed_forward_network(d_model_size, dff) + + self.layernorm1 = nn.LayerNorm(d_model_size, eps=1e-6) + self.layernorm2 = nn.LayerNorm(d_model_size, eps=1e-6) + + self.dropout1 = nn.Dropout(rate) + self.dropout2 = nn.Dropout(rate) + + def forward( + self, + x, + mask, + layer_past=None, + attention_mask=None, + head_mask=None, + use_cache=False, + output_attentions=False, + cache_position=None, + ): + normed = self.layernorm1(x) + attn_outputs = self.multi_head_attention( + normed, + normed, + normed, + mask, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attn_output = attn_outputs[0] + attn_output = self.dropout1(attn_output) + out1 = x + attn_output + + out2 = self.layernorm2(out1) + ffn_output = self.ffn(out2) + ffn_output = self.dropout2(ffn_output) + out2 = out1 + ffn_output + + outputs = (out2,) + attn_outputs[1:] + return outputs + + +@auto_docstring +class CTRLPreTrainedModel(PreTrainedModel): + config: CTRLConfig + base_model_prefix = "transformer" + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring +class CTRLModel(CTRLPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.d_model_size = config.n_embd + self.num_layers = config.n_layer + + self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size, torch.float) + + self.w = nn.Embedding(config.vocab_size, config.n_embd) + + self.dropout = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList( + [ + EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop, layer_idx=i) + for i in range(config.n_layer) + ] + ) + self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.w + + def set_input_embeddings(self, new_embeddings): + self.w = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].multi_head_attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, # NOOP kwargs, for now + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + + Example: + + ```python + >>> from transformers import AutoTokenizer, CTRLModel + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl") + >>> model = CTRLModel.from_pretrained("Salesforce/ctrl") + + >>> # CTRL was trained with control codes as the first token + >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt") + >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values() + + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 5, 1280] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + if use_cache and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `DynamicCache` instead, e.g. " + "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + + past_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if position_ids is None: + position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0) + + # Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + token_type_embeds = self.w(token_type_ids) + token_type_embeds *= np.sqrt(self.d_model_size) + else: + token_type_embeds = 0 + + if inputs_embeds is None: + inputs_embeds = self.w(input_ids) + # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded + seq_len = input_shape[-1] + mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(device) + + inputs_embeds *= np.sqrt(self.d_model_size) + + # `self.pos_encoding` won't be sent to the correct device along the model, so we do it manually. + self.pos_encoding = self.pos_encoding.to(device) + pos_embeds = self.pos_encoding[position_ids, :] + + hidden_states = inputs_embeds + pos_embeds + token_type_embeds + + hidden_states = self.dropout(hidden_states) + + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, h in enumerate(self.h): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + outputs = h( + hidden_states, + mask, + layer_past=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = outputs[0] + if output_attentions: + all_attentions += (outputs[1],) + + hidden_states = self.layernorm(hidden_states) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, past_key_values, all_hidden_states, all_attentions] if v is not None + ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + +@auto_docstring( + custom_intro=""" + The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """ +) +class CTRLLMHeadModel(CTRLPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.transformer = CTRLModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, CTRLLMHeadModel + + >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl") + >>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl") + + >>> # CTRL was trained with control codes as the first token + >>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt") + >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values() + + >>> sequence_ids = model.generate(inputs["input_ids"]) + >>> sequences = tokenizer.batch_decode(sequence_ids) + >>> sequences + ['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,'] + + >>> outputs = model(**inputs, labels=inputs["input_ids"]) + >>> round(outputs.loss.item(), 2) + 9.21 + + >>> list(outputs.logits.shape) + [1, 5, 246534] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + loss = self.loss_function( + lm_logits, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_cache=None, **kwargs): + # Overwritten -- inputs_embeds not working properly + + # only last tokens for inputs_ids if past is defined in kwargs + if past_key_values is not None: + past_length = past_key_values.get_seq_length() + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache} + + # token_type_ids are computed on CTRLModel.forward() + kwargs.pop("token_type_ids", None) + # Forward ALL kwargs that are uninitialized (e.g. `use_cache`). + for key, value in kwargs.items(): + if key not in model_inputs: + print(f"Warning: {key} is not a recognized input.") + model_inputs[key] = value + + return model_inputs + + +@auto_docstring( + custom_intro=""" + The CTRL Model transformer with a sequence classification head on top (linear layer). + [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last + token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in + each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot + guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last + value in each row of the batch). + """ +) +class CTRLForSequenceClassification(CTRLPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = CTRLModel(config) + self.classifier = nn.Linear(config.n_embd, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Example of single-label classification: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, CTRLForSequenceClassification + + >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl") + >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl") + + >>> # CTRL was trained with control codes as the first token + >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt") + >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values() + + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + + >>> predicted_class_id = logits.argmax().item() + >>> model.config.id2label[predicted_class_id] + 'LABEL_0' + ``` + + ```python + >>> import torch + + >>> torch.manual_seed(42) # doctest: +IGNORE_RESULT + >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)` + >>> num_labels = len(model.config.id2label) + >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels) + + >>> labels = torch.tensor(1) + >>> loss = model(**inputs, labels=labels).loss + >>> round(loss.item(), 2) + 0.93 + ``` + + Example of multi-label classification: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, CTRLForSequenceClassification + + >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl") + >>> model = CTRLForSequenceClassification.from_pretrained( + ... "Salesforce/ctrl", problem_type="multi_label_classification" + ... ) + + >>> # CTRL was trained with control codes as the first token + >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt") + >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values() + + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + + >>> predicted_class_id = logits.argmax().item() + >>> model.config.id2label[predicted_class_id] + 'LABEL_0' + ``` + + ```python + >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)` + >>> num_labels = len(model.config.id2label) + >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels) + + >>> num_labels = len(model.config.id2label) + >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to( + ... torch.float + ... ) + >>> loss = model(**inputs, labels=labels).loss + >>> loss.backward() # doctest: +IGNORE_RESULT + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + logits = self.classifier(hidden_states) + + if input_ids is not None: + batch_size, sequence_length = input_ids.shape[:2] + else: + batch_size, sequence_length = inputs_embeds.shape[:2] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + last_non_pad_token = -1 + elif input_ids is not None: + # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id + non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32) + token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32) + last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) + else: + last_non_pad_token = -1 + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=pooled_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = ["CTRLForSequenceClassification", "CTRLLMHeadModel", "CTRLModel", "CTRLPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_tf_ctrl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_tf_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..1dce90147bd854b5072410eb6d6e74f73c0a9fd0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_tf_ctrl.py @@ -0,0 +1,920 @@ +# coding=utf-8 +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 CTRL model.""" + +from __future__ import annotations + +import numpy as np +import tensorflow as tf + +from ...modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast, TFSequenceClassifierOutput +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_ctrl import CTRLConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "Salesforce/ctrl" +_CONFIG_FOR_DOC = "CTRLConfig" + + +def angle_defn(pos, i, d_model_size): + angle_rates = 1 / np.power(10000, (2 * (i // 2)) / d_model_size) + return pos * angle_rates + + +def positional_encoding(position, d_model_size): + # create the sinusoidal pattern for the positional encoding + angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size) + + sines = np.sin(angle_rads[:, 0::2]) + cosines = np.cos(angle_rads[:, 1::2]) + pos_encoding = tf.convert_to_tensor(np.concatenate([sines, cosines], axis=-1)) + + return pos_encoding + + +def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): + # calculate attention + matmul_qk = tf.matmul(q, k, transpose_b=True) + + dk = tf.cast(shape_list(k)[-1], dtype=matmul_qk.dtype) + scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) + + if mask is not None: + scaled_attention_logits += tf.cast(mask * -1e4, dtype=scaled_attention_logits.dtype) + + if attention_mask is not None: + # Apply the attention mask + attention_mask = tf.cast(attention_mask, dtype=scaled_attention_logits.dtype) + scaled_attention_logits = scaled_attention_logits + attention_mask + + attention_weights = stable_softmax(scaled_attention_logits, axis=-1) + + # Mask heads if we want to + if head_mask is not None: + attention_weights = attention_weights * head_mask + + output = tf.matmul(attention_weights, v) + + return output, attention_weights + + +class TFMultiHeadAttention(keras.layers.Layer): + def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs): + super().__init__(**kwargs) + self.num_heads = num_heads + self.d_model_size = d_model_size + self.output_attentions = output_attentions + + self.depth = int(d_model_size / self.num_heads) + + self.Wq = keras.layers.Dense(d_model_size, name="Wq") + self.Wk = keras.layers.Dense(d_model_size, name="Wk") + self.Wv = keras.layers.Dense(d_model_size, name="Wv") + + self.dense = keras.layers.Dense(d_model_size, name="dense") + + def split_into_heads(self, x, batch_size): + x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def call(self, v, k, q, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False): + batch_size = shape_list(q)[0] + + q = self.Wq(q) + k = self.Wk(k) + v = self.Wv(v) + + q = self.split_into_heads(q, batch_size) + k = self.split_into_heads(k, batch_size) + v = self.split_into_heads(v, batch_size) + + if layer_past is not None: + past_key, past_value = tf.unstack(layer_past, axis=0) + k = tf.concat((past_key, k), axis=-2) + v = tf.concat((past_value, v), axis=-2) + + if use_cache: + present = tf.stack((k, v), axis=0) + else: + present = (None,) + + output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask) + scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3]) + attn = output[1] + original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size)) + output = self.dense(original_size_attention) + outputs = (output, present) + + if output_attentions: + outputs = outputs + (attn,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "Wq", None) is not None: + with tf.name_scope(self.Wq.name): + self.Wq.build([None, None, self.d_model_size]) + if getattr(self, "Wk", None) is not None: + with tf.name_scope(self.Wk.name): + self.Wk.build([None, None, self.d_model_size]) + if getattr(self, "Wv", None) is not None: + with tf.name_scope(self.Wv.name): + self.Wv.build([None, None, self.d_model_size]) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.d_model_size]) + + +class TFPointWiseFeedForwardLayer(keras.layers.Layer): + def __init__(self, d_model_size, dff, **kwargs): + super().__init__(**kwargs) + + self.dense_0 = keras.layers.Dense(dff, activation="relu", name="0") + self.dense_2 = keras.layers.Dense(d_model_size, name="2") + self.d_model_size = d_model_size + self.dff = dff + + def call(self, inputs, trainable=False): + dense_0_output = self.dense_0(inputs) + dense_2_output = self.dense_2(dense_0_output) + + return dense_2_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense_0", None) is not None: + with tf.name_scope(self.dense_0.name): + self.dense_0.build([None, None, self.d_model_size]) + if getattr(self, "dense_2", None) is not None: + with tf.name_scope(self.dense_2.name): + self.dense_2.build([None, None, self.dff]) + + +class TFEncoderLayer(keras.layers.Layer): + def __init__( + self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs + ): + super().__init__(**kwargs) + + self.output_attentions = output_attentions + + self.multi_head_attention = TFMultiHeadAttention( + d_model_size, num_heads, output_attentions=self.output_attentions, name="multi_head_attention" + ) + self.ffn = TFPointWiseFeedForwardLayer(d_model_size, dff, name="ffn") + + self.layernorm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1") + self.layernorm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2") + + self.dropout1 = keras.layers.Dropout(rate) + self.dropout2 = keras.layers.Dropout(rate) + self.d_model_size = d_model_size + + def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False): + normed = self.layernorm1(x) + attn_outputs = self.multi_head_attention( + normed, + normed, + normed, + mask, + layer_past, + attention_mask, + head_mask, + use_cache, + output_attentions, + training=training, + ) + attn_output = attn_outputs[0] + attn_output = self.dropout1(attn_output, training=training) + out1 = x + attn_output + + out2 = self.layernorm2(out1) + ffn_output = self.ffn(out2) + ffn_output = self.dropout2(ffn_output, training=training) + out2 = out1 + ffn_output + + outputs = (out2,) + attn_outputs[1:] + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "multi_head_attention", None) is not None: + with tf.name_scope(self.multi_head_attention.name): + self.multi_head_attention.build(None) + if getattr(self, "ffn", None) is not None: + with tf.name_scope(self.ffn.name): + self.ffn.build(None) + if getattr(self, "layernorm1", None) is not None: + with tf.name_scope(self.layernorm1.name): + self.layernorm1.build([None, None, self.d_model_size]) + if getattr(self, "layernorm2", None) is not None: + with tf.name_scope(self.layernorm2.name): + self.layernorm2.build([None, None, self.d_model_size]) + + +@keras_serializable +class TFCTRLMainLayer(keras.layers.Layer): + config_class = CTRLConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.output_hidden_states = config.output_hidden_states + self.output_attentions = config.output_attentions + self.use_cache = config.use_cache + self.return_dict = config.use_return_dict + + self.d_model_size = config.n_embd + self.num_layers = config.n_layer + + self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size) + + self.w = keras.layers.Embedding( + input_dim=config.vocab_size, + output_dim=config.n_embd, + embeddings_initializer=get_initializer(config.initializer_range), + name="w", + ) + + self.dropout = keras.layers.Dropout(config.embd_pdrop) + self.h = [ + TFEncoderLayer( + config.n_embd, + config.n_head, + config.dff, + config.resid_pdrop, + config.layer_norm_epsilon, + self.output_attentions, + name=f"h_._{i}", + ) + for i in range(config.n_layer) + ] + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm") + + def get_input_embeddings(self): + return self.w + + def set_input_embeddings(self, new_embeddings): + self.w = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + raise NotImplementedError + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFBaseModelOutputWithPast: + # If using past key value states, only the last tokens + # should be given as an input + if past_key_values is not None: + if input_ids is not None: + input_ids = input_ids[:, -1:] + if inputs_embeds is not None: + inputs_embeds = inputs_embeds[:, -1:] + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1:] + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if past_key_values is None: + past_length = 0 + past_key_values = [None] * len(self.h) + else: + past_length = shape_list(past_key_values[0][0])[-2] + if position_ids is None: + position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32), axis=0) + position_ids = tf.tile(position_ids, [input_shape[0], 1]) + + # Attention mask. + if attention_mask is not None: + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1] + past_length)) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + + one_cst = tf.constant(1.0) + ten_thousand_cst = tf.constant(-10000.0) + attention_mask = tf.cast(attention_mask, dtype=one_cst.dtype) + attention_mask = tf.multiply(tf.subtract(one_cst, attention_mask), ten_thousand_cst) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.num_layers + + if token_type_ids is not None: + token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) + token_type_embeds = self.w(token_type_ids) + token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, dtype=token_type_embeds.dtype)) + else: + token_type_embeds = tf.constant(0.0) + position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.w.input_dim) + inputs_embeds = self.w(input_ids) + seq_len = input_shape[-1] + mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) + + inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, inputs_embeds.dtype)) + + pos_embeds = tf.gather(self.pos_encoding, position_ids) + pos_embeds = tf.cast(pos_embeds, dtype=token_type_embeds.dtype) + hidden_states = inputs_embeds + pos_embeds + token_type_embeds + + hidden_states = self.dropout(hidden_states, training=training) + + output_shape = input_shape + [shape_list(hidden_states)[-1]] + presents = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, (h, layer_past) in enumerate(zip(self.h, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) + outputs = h( + hidden_states, + mask, + layer_past, + attention_mask, + head_mask[i], + use_cache, + output_attentions, + training=training, + ) + hidden_states, present = outputs[:2] + + if use_cache: + presents = presents + (present,) + + if output_attentions: + all_attentions = all_attentions + (outputs[2],) + + hidden_states = self.layernorm(hidden_states) + hidden_states = tf.reshape(hidden_states, output_shape) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if output_attentions: + # let the number of heads free (-1) so we can extract attention even after head pruning + attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] + all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "w", None) is not None: + with tf.name_scope(self.w.name): + self.w.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.n_embd]) + if getattr(self, "h", None) is not None: + for layer in self.h: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFCTRLPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CTRLConfig + base_model_prefix = "transformer" + + +CTRL_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`CTRLConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CTRL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past` is `None` else `past[0].shape[-2]` (`sequence_length` of + input past key value states). + + Indices of input sequence tokens in the vocabulary. + + If `past` is used, only input IDs that do not have their past calculated should be passed as `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + past (`list[tf.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see + `past` output below). Can be used to speed up sequential decoding. The token ids which have their past + given to this model should not be passed as input ids as they have already been computed. + attention_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past` key value states are returned and can be used to speed up decoding (see `past`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", + CTRL_START_DOCSTRING, +) +class TFCTRLModel(TFCTRLPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFCTRLMainLayer(config, name="transformer") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFBaseModelOutputWithPast: + outputs = self.transformer( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + + +class TFCTRLBiasLayer(keras.layers.Layer): + """ + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, + so all weights have to be registered in a layer. + """ + + def __init__(self, shape, initializer, trainable, name, **kwargs): + super().__init__(name=name, **kwargs) + self.shape = shape + self.initializer = initializer + self.trainable = trainable + + def build(self, input_shape): + self.bias = self.add_weight( + name="bias", shape=self.shape, initializer=self.initializer, trainable=self.trainable + ) + super().build(input_shape) + + def call(self, x): + return x + self.bias + + +@add_start_docstrings( + """ + The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """, + CTRL_START_DOCSTRING, +) +class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFCTRLMainLayer(config, name="transformer") + self.bias_layer = TFCTRLBiasLayer( + name="lm_head", shape=[1, config.vocab_size], initializer="zeros", trainable=True + ) + + def get_output_embeddings(self): + return self.get_input_embeddings() + + def set_output_embeddings(self, value): + self.set_input_embeddings(value) + + def get_bias(self): + return {"lm_head.bias": self.bias_layer.bias} + + def set_bias(self, value): + # Replaces the existing layers containing bias for correct (de)serialization. + vocab_size = value["lm_head.bias"].shape[-1] + self.bias_layer = TFCTRLBiasLayer( + name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=True + ) + self.bias_layer.build(None) + self.bias_layer.bias.assign(value["lm_head.bias"]) + + # Copied from transformers.models.gpt2.modeling_tf_gpt2.TFGPT2LMHeadModel.prepare_inputs_for_generation + def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs): + token_type_ids = kwargs.get("token_type_ids") + # only last token for inputs_ids if past is defined in kwargs + if past_key_values: + inputs = tf.expand_dims(inputs[:, -1], -1) + if token_type_ids is not None: + token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1) + + position_ids = kwargs.get("position_ids") + attention_mask = kwargs.get("attention_mask") + + if attention_mask is not None and position_ids is None: + position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True) + if past_key_values: + position_ids = tf.expand_dims(position_ids[:, -1], -1) + + return { + "input_ids": inputs, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "token_type_ids": token_type_ids, + } + + @unpack_inputs + @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFCausalLMOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> tuple | TFCausalLMOutputWithPast: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., + config.vocab_size - 1]`. + """ + transformer_outputs = self.transformer( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = transformer_outputs[0] + logits = tf.matmul(hidden_states, self.transformer.w.weights, transpose_b=True) + logits = self.bias_layer(logits) + + loss = None + if labels is not None: + # shift labels to the left and cut last logit token + shifted_logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.hf_compute_loss(labels, shifted_logits) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) + + +@add_start_docstrings( + """ + The CTRL Model transformer with a sequence classification head on top (linear layer). + + [`TFCTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-1, GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + CTRL_START_DOCSTRING, +) +class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + self.classifier = keras.layers.Dense( + config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", + use_bias=False, + ) + self.transformer = TFCTRLMainLayer(config, name="transformer") + self.config = config + + def get_output_embeddings(self): + # Remove after transformers v4.32. Fix this model's `test_model_common_attributes` test too. + logger.warning( + "Sequence classification models do not have output embeddings. `.get_output_embeddings` will be removed " + "in transformers v4.32." + ) + return self.transformer.w + + @unpack_inputs + @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> tuple | TFSequenceClassifierOutput: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., + config.vocab_size - 1]`. + """ + + transformer_outputs = self.transformer( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = transformer_outputs[0] + logits = self.classifier(hidden_states) + logits_shape = shape_list(logits) + batch_size = logits_shape[0] + + if self.config.pad_token_id is None: + last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1) + else: + if input_ids is not None: + token_indices = tf.range(shape_list(input_ids)[-1]) + non_pad_mask = tf.cast(input_ids != self.config.pad_token_id, token_indices.dtype) + last_non_pad_token = tf.reduce_max(token_indices * non_pad_mask, axis=-1) + else: + last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1) + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + loss = None + + pooled_logits = tf.gather(logits, last_non_pad_token, batch_dims=1, axis=1) + + if labels is not None: + if self.config.pad_token_id is None and logits_shape[0] != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + + loss = self.hf_compute_loss(tf.reshape(labels, [-1]), tf.reshape(pooled_logits, [-1, self.num_labels])) + + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=pooled_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.n_embd]) + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + + +__all__ = ["TFCTRLForSequenceClassification", "TFCTRLLMHeadModel", "TFCTRLModel", "TFCTRLPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..756aded9e6ad2964ffa5add89af2c4af56071e88 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_cvt import * + from .modeling_cvt import * + from .modeling_tf_cvt import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1493ccb6e1ec2e3ee7b6276c321e046d639707bf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/configuration_cvt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/configuration_cvt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d18d603511f707943e62c9ee01827891550a06b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/configuration_cvt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/modeling_cvt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/modeling_cvt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdb38ce24c0e75e6ca4df0b7749b71a18120d003 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/modeling_cvt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/modeling_tf_cvt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/modeling_tf_cvt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf1bb68399f58b5df4bf4f4cecf196b37977158a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/modeling_tf_cvt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/configuration_cvt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/configuration_cvt.py new file mode 100644 index 0000000000000000000000000000000000000000..ba3f6b339627d868f79e4c53a022fed117166074 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/configuration_cvt.py @@ -0,0 +1,146 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CvT model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CvtConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CvtModel`]. It is used to instantiate a CvT model + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the CvT + [microsoft/cvt-13](https://huggingface.co/microsoft/cvt-13) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + patch_sizes (`list[int]`, *optional*, defaults to `[7, 3, 3]`): + The kernel size of each encoder's patch embedding. + patch_stride (`list[int]`, *optional*, defaults to `[4, 2, 2]`): + The stride size of each encoder's patch embedding. + patch_padding (`list[int]`, *optional*, defaults to `[2, 1, 1]`): + The padding size of each encoder's patch embedding. + embed_dim (`list[int]`, *optional*, defaults to `[64, 192, 384]`): + Dimension of each of the encoder blocks. + num_heads (`list[int]`, *optional*, defaults to `[1, 3, 6]`): + Number of attention heads for each attention layer in each block of the Transformer encoder. + depth (`list[int]`, *optional*, defaults to `[1, 2, 10]`): + The number of layers in each encoder block. + mlp_ratios (`list[float]`, *optional*, defaults to `[4.0, 4.0, 4.0, 4.0]`): + Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the + encoder blocks. + attention_drop_rate (`list[float]`, *optional*, defaults to `[0.0, 0.0, 0.0]`): + The dropout ratio for the attention probabilities. + drop_rate (`list[float]`, *optional*, defaults to `[0.0, 0.0, 0.0]`): + The dropout ratio for the patch embeddings probabilities. + drop_path_rate (`list[float]`, *optional*, defaults to `[0.0, 0.0, 0.1]`): + The dropout probability for stochastic depth, used in the blocks of the Transformer encoder. + qkv_bias (`list[bool]`, *optional*, defaults to `[True, True, True]`): + The bias bool for query, key and value in attentions + cls_token (`list[bool]`, *optional*, defaults to `[False, False, True]`): + Whether or not to add a classification token to the output of each of the last 3 stages. + qkv_projection_method (`list[string]`, *optional*, defaults to ["dw_bn", "dw_bn", "dw_bn"]`): + The projection method for query, key and value Default is depth-wise convolutions with batch norm. For + Linear projection use "avg". + kernel_qkv (`list[int]`, *optional*, defaults to `[3, 3, 3]`): + The kernel size for query, key and value in attention layer + padding_kv (`list[int]`, *optional*, defaults to `[1, 1, 1]`): + The padding size for key and value in attention layer + stride_kv (`list[int]`, *optional*, defaults to `[2, 2, 2]`): + The stride size for key and value in attention layer + padding_q (`list[int]`, *optional*, defaults to `[1, 1, 1]`): + The padding size for query in attention layer + stride_q (`list[int]`, *optional*, defaults to `[1, 1, 1]`): + The stride size for query in attention layer + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-6): + The epsilon used by the layer normalization layers. + + Example: + + ```python + >>> from transformers import CvtConfig, CvtModel + + >>> # Initializing a Cvt msft/cvt style configuration + >>> configuration = CvtConfig() + + >>> # Initializing a model (with random weights) from the msft/cvt style configuration + >>> model = CvtModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "cvt" + + def __init__( + self, + num_channels=3, + patch_sizes=[7, 3, 3], + patch_stride=[4, 2, 2], + patch_padding=[2, 1, 1], + embed_dim=[64, 192, 384], + num_heads=[1, 3, 6], + depth=[1, 2, 10], + mlp_ratio=[4.0, 4.0, 4.0], + attention_drop_rate=[0.0, 0.0, 0.0], + drop_rate=[0.0, 0.0, 0.0], + drop_path_rate=[0.0, 0.0, 0.1], + qkv_bias=[True, True, True], + cls_token=[False, False, True], + qkv_projection_method=["dw_bn", "dw_bn", "dw_bn"], + kernel_qkv=[3, 3, 3], + padding_kv=[1, 1, 1], + stride_kv=[2, 2, 2], + padding_q=[1, 1, 1], + stride_q=[1, 1, 1], + initializer_range=0.02, + layer_norm_eps=1e-12, + **kwargs, + ): + super().__init__(**kwargs) + self.num_channels = num_channels + self.patch_sizes = patch_sizes + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.embed_dim = embed_dim + self.num_heads = num_heads + self.depth = depth + self.mlp_ratio = mlp_ratio + self.attention_drop_rate = attention_drop_rate + self.drop_rate = drop_rate + self.drop_path_rate = drop_path_rate + self.qkv_bias = qkv_bias + self.cls_token = cls_token + self.qkv_projection_method = qkv_projection_method + self.kernel_qkv = kernel_qkv + self.padding_kv = padding_kv + self.stride_kv = stride_kv + self.padding_q = padding_q + self.stride_q = stride_q + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + +__all__ = ["CvtConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_cvt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_cvt.py new file mode 100644 index 0000000000000000000000000000000000000000..9d935ee84893e05e925fde0ab2fd80397c7144e5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_cvt.py @@ -0,0 +1,670 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch CvT model.""" + +import collections.abc +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import auto_docstring, logging +from .configuration_cvt import CvtConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for model's outputs, with potential hidden states and attentions. + """ +) +class BaseModelOutputWithCLSToken(ModelOutput): + r""" + cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`): + Classification token at the output of the last layer of the model. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + cls_token_value: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath +class CvtDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class CvtEmbeddings(nn.Module): + """ + Construct the CvT embeddings. + """ + + def __init__(self, patch_size, num_channels, embed_dim, stride, padding, dropout_rate): + super().__init__() + self.convolution_embeddings = CvtConvEmbeddings( + patch_size=patch_size, num_channels=num_channels, embed_dim=embed_dim, stride=stride, padding=padding + ) + self.dropout = nn.Dropout(dropout_rate) + + def forward(self, pixel_values): + hidden_state = self.convolution_embeddings(pixel_values) + hidden_state = self.dropout(hidden_state) + return hidden_state + + +class CvtConvEmbeddings(nn.Module): + """ + Image to Conv Embedding. + """ + + def __init__(self, patch_size, num_channels, embed_dim, stride, padding): + super().__init__() + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + self.patch_size = patch_size + self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=stride, padding=padding) + self.normalization = nn.LayerNorm(embed_dim) + + def forward(self, pixel_values): + pixel_values = self.projection(pixel_values) + batch_size, num_channels, height, width = pixel_values.shape + hidden_size = height * width + # rearrange "b c h w -> b (h w) c" + pixel_values = pixel_values.view(batch_size, num_channels, hidden_size).permute(0, 2, 1) + if self.normalization: + pixel_values = self.normalization(pixel_values) + # rearrange "b (h w) c" -> b c h w" + pixel_values = pixel_values.permute(0, 2, 1).view(batch_size, num_channels, height, width) + return pixel_values + + +class CvtSelfAttentionConvProjection(nn.Module): + def __init__(self, embed_dim, kernel_size, padding, stride): + super().__init__() + self.convolution = nn.Conv2d( + embed_dim, + embed_dim, + kernel_size=kernel_size, + padding=padding, + stride=stride, + bias=False, + groups=embed_dim, + ) + self.normalization = nn.BatchNorm2d(embed_dim) + + def forward(self, hidden_state): + hidden_state = self.convolution(hidden_state) + hidden_state = self.normalization(hidden_state) + return hidden_state + + +class CvtSelfAttentionLinearProjection(nn.Module): + def forward(self, hidden_state): + batch_size, num_channels, height, width = hidden_state.shape + hidden_size = height * width + # rearrange " b c h w -> b (h w) c" + hidden_state = hidden_state.view(batch_size, num_channels, hidden_size).permute(0, 2, 1) + return hidden_state + + +class CvtSelfAttentionProjection(nn.Module): + def __init__(self, embed_dim, kernel_size, padding, stride, projection_method="dw_bn"): + super().__init__() + if projection_method == "dw_bn": + self.convolution_projection = CvtSelfAttentionConvProjection(embed_dim, kernel_size, padding, stride) + self.linear_projection = CvtSelfAttentionLinearProjection() + + def forward(self, hidden_state): + hidden_state = self.convolution_projection(hidden_state) + hidden_state = self.linear_projection(hidden_state) + return hidden_state + + +class CvtSelfAttention(nn.Module): + def __init__( + self, + num_heads, + embed_dim, + kernel_size, + padding_q, + padding_kv, + stride_q, + stride_kv, + qkv_projection_method, + qkv_bias, + attention_drop_rate, + with_cls_token=True, + **kwargs, + ): + super().__init__() + self.scale = embed_dim**-0.5 + self.with_cls_token = with_cls_token + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.convolution_projection_query = CvtSelfAttentionProjection( + embed_dim, + kernel_size, + padding_q, + stride_q, + projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method, + ) + self.convolution_projection_key = CvtSelfAttentionProjection( + embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method + ) + self.convolution_projection_value = CvtSelfAttentionProjection( + embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method + ) + + self.projection_query = nn.Linear(embed_dim, embed_dim, bias=qkv_bias) + self.projection_key = nn.Linear(embed_dim, embed_dim, bias=qkv_bias) + self.projection_value = nn.Linear(embed_dim, embed_dim, bias=qkv_bias) + + self.dropout = nn.Dropout(attention_drop_rate) + + def rearrange_for_multi_head_attention(self, hidden_state): + batch_size, hidden_size, _ = hidden_state.shape + head_dim = self.embed_dim // self.num_heads + # rearrange 'b t (h d) -> b h t d' + return hidden_state.view(batch_size, hidden_size, self.num_heads, head_dim).permute(0, 2, 1, 3) + + def forward(self, hidden_state, height, width): + if self.with_cls_token: + cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1) + batch_size, hidden_size, num_channels = hidden_state.shape + # rearrange "b (h w) c -> b c h w" + hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width) + + key = self.convolution_projection_key(hidden_state) + query = self.convolution_projection_query(hidden_state) + value = self.convolution_projection_value(hidden_state) + + if self.with_cls_token: + query = torch.cat((cls_token, query), dim=1) + key = torch.cat((cls_token, key), dim=1) + value = torch.cat((cls_token, value), dim=1) + + head_dim = self.embed_dim // self.num_heads + + query = self.rearrange_for_multi_head_attention(self.projection_query(query)) + key = self.rearrange_for_multi_head_attention(self.projection_key(key)) + value = self.rearrange_for_multi_head_attention(self.projection_value(value)) + + attention_score = torch.einsum("bhlk,bhtk->bhlt", [query, key]) * self.scale + attention_probs = torch.nn.functional.softmax(attention_score, dim=-1) + attention_probs = self.dropout(attention_probs) + + context = torch.einsum("bhlt,bhtv->bhlv", [attention_probs, value]) + # rearrange"b h t d -> b t (h d)" + _, _, hidden_size, _ = context.shape + context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, hidden_size, self.num_heads * head_dim) + return context + + +class CvtSelfOutput(nn.Module): + """ + The residual connection is defined in CvtLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, embed_dim, drop_rate): + super().__init__() + self.dense = nn.Linear(embed_dim, embed_dim) + self.dropout = nn.Dropout(drop_rate) + + def forward(self, hidden_state, input_tensor): + hidden_state = self.dense(hidden_state) + hidden_state = self.dropout(hidden_state) + return hidden_state + + +class CvtAttention(nn.Module): + def __init__( + self, + num_heads, + embed_dim, + kernel_size, + padding_q, + padding_kv, + stride_q, + stride_kv, + qkv_projection_method, + qkv_bias, + attention_drop_rate, + drop_rate, + with_cls_token=True, + ): + super().__init__() + self.attention = CvtSelfAttention( + num_heads, + embed_dim, + kernel_size, + padding_q, + padding_kv, + stride_q, + stride_kv, + qkv_projection_method, + qkv_bias, + attention_drop_rate, + with_cls_token, + ) + self.output = CvtSelfOutput(embed_dim, drop_rate) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, hidden_state, height, width): + self_output = self.attention(hidden_state, height, width) + attention_output = self.output(self_output, hidden_state) + return attention_output + + +class CvtIntermediate(nn.Module): + def __init__(self, embed_dim, mlp_ratio): + super().__init__() + self.dense = nn.Linear(embed_dim, int(embed_dim * mlp_ratio)) + self.activation = nn.GELU() + + def forward(self, hidden_state): + hidden_state = self.dense(hidden_state) + hidden_state = self.activation(hidden_state) + return hidden_state + + +class CvtOutput(nn.Module): + def __init__(self, embed_dim, mlp_ratio, drop_rate): + super().__init__() + self.dense = nn.Linear(int(embed_dim * mlp_ratio), embed_dim) + self.dropout = nn.Dropout(drop_rate) + + def forward(self, hidden_state, input_tensor): + hidden_state = self.dense(hidden_state) + hidden_state = self.dropout(hidden_state) + hidden_state = hidden_state + input_tensor + return hidden_state + + +class CvtLayer(nn.Module): + """ + CvtLayer composed by attention layers, normalization and multi-layer perceptrons (mlps). + """ + + def __init__( + self, + num_heads, + embed_dim, + kernel_size, + padding_q, + padding_kv, + stride_q, + stride_kv, + qkv_projection_method, + qkv_bias, + attention_drop_rate, + drop_rate, + mlp_ratio, + drop_path_rate, + with_cls_token=True, + ): + super().__init__() + self.attention = CvtAttention( + num_heads, + embed_dim, + kernel_size, + padding_q, + padding_kv, + stride_q, + stride_kv, + qkv_projection_method, + qkv_bias, + attention_drop_rate, + drop_rate, + with_cls_token, + ) + + self.intermediate = CvtIntermediate(embed_dim, mlp_ratio) + self.output = CvtOutput(embed_dim, mlp_ratio, drop_rate) + self.drop_path = CvtDropPath(drop_prob=drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.layernorm_before = nn.LayerNorm(embed_dim) + self.layernorm_after = nn.LayerNorm(embed_dim) + + def forward(self, hidden_state, height, width): + self_attention_output = self.attention( + self.layernorm_before(hidden_state), # in Cvt, layernorm is applied before self-attention + height, + width, + ) + attention_output = self_attention_output + attention_output = self.drop_path(attention_output) + + # first residual connection + hidden_state = attention_output + hidden_state + + # in Cvt, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_state) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_state) + layer_output = self.drop_path(layer_output) + return layer_output + + +class CvtStage(nn.Module): + def __init__(self, config, stage): + super().__init__() + self.config = config + self.stage = stage + if self.config.cls_token[self.stage]: + self.cls_token = nn.Parameter(torch.randn(1, 1, self.config.embed_dim[-1])) + + self.embedding = CvtEmbeddings( + patch_size=config.patch_sizes[self.stage], + stride=config.patch_stride[self.stage], + num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1], + embed_dim=config.embed_dim[self.stage], + padding=config.patch_padding[self.stage], + dropout_rate=config.drop_rate[self.stage], + ) + + drop_path_rates = [ + x.item() for x in torch.linspace(0, config.drop_path_rate[self.stage], config.depth[stage], device="cpu") + ] + + self.layers = nn.Sequential( + *[ + CvtLayer( + num_heads=config.num_heads[self.stage], + embed_dim=config.embed_dim[self.stage], + kernel_size=config.kernel_qkv[self.stage], + padding_q=config.padding_q[self.stage], + padding_kv=config.padding_kv[self.stage], + stride_kv=config.stride_kv[self.stage], + stride_q=config.stride_q[self.stage], + qkv_projection_method=config.qkv_projection_method[self.stage], + qkv_bias=config.qkv_bias[self.stage], + attention_drop_rate=config.attention_drop_rate[self.stage], + drop_rate=config.drop_rate[self.stage], + drop_path_rate=drop_path_rates[self.stage], + mlp_ratio=config.mlp_ratio[self.stage], + with_cls_token=config.cls_token[self.stage], + ) + for _ in range(config.depth[self.stage]) + ] + ) + + def forward(self, hidden_state): + cls_token = None + hidden_state = self.embedding(hidden_state) + batch_size, num_channels, height, width = hidden_state.shape + # rearrange b c h w -> b (h w) c" + hidden_state = hidden_state.view(batch_size, num_channels, height * width).permute(0, 2, 1) + if self.config.cls_token[self.stage]: + cls_token = self.cls_token.expand(batch_size, -1, -1) + hidden_state = torch.cat((cls_token, hidden_state), dim=1) + + for layer in self.layers: + layer_outputs = layer(hidden_state, height, width) + hidden_state = layer_outputs + + if self.config.cls_token[self.stage]: + cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1) + hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width) + return hidden_state, cls_token + + +class CvtEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.stages = nn.ModuleList([]) + for stage_idx in range(len(config.depth)): + self.stages.append(CvtStage(config, stage_idx)) + + def forward(self, pixel_values, output_hidden_states=False, return_dict=True): + all_hidden_states = () if output_hidden_states else None + hidden_state = pixel_values + + cls_token = None + for _, (stage_module) in enumerate(self.stages): + hidden_state, cls_token = stage_module(hidden_state) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + if not return_dict: + return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None) + + return BaseModelOutputWithCLSToken( + last_hidden_state=hidden_state, + cls_token_value=cls_token, + hidden_states=all_hidden_states, + ) + + +@auto_docstring +class CvtPreTrainedModel(PreTrainedModel): + config: CvtConfig + base_model_prefix = "cvt" + main_input_name = "pixel_values" + _no_split_modules = ["CvtLayer"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data = nn.init.trunc_normal_(module.weight.data, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, CvtStage): + if self.config.cls_token[module.stage]: + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data, mean=0.0, std=self.config.initializer_range + ) + + +@auto_docstring +class CvtModel(CvtPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + self.encoder = CvtEncoder(config) + self.post_init() + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithCLSToken]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + encoder_outputs = self.encoder( + pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutputWithCLSToken( + last_hidden_state=sequence_output, + cls_token_value=encoder_outputs.cls_token_value, + hidden_states=encoder_outputs.hidden_states, + ) + + +@auto_docstring( + custom_intro=""" + Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. + """ +) +class CvtForImageClassification(CvtPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.num_labels = config.num_labels + self.cvt = CvtModel(config, add_pooling_layer=False) + self.layernorm = nn.LayerNorm(config.embed_dim[-1]) + # Classifier head + self.classifier = ( + nn.Linear(config.embed_dim[-1], config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutputWithNoAttention]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.cvt( + pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + cls_token = outputs[1] + if self.config.cls_token[-1]: + sequence_output = self.layernorm(cls_token) + else: + batch_size, num_channels, height, width = sequence_output.shape + # rearrange "b c h w -> b (h w) c" + sequence_output = sequence_output.view(batch_size, num_channels, height * width).permute(0, 2, 1) + sequence_output = self.layernorm(sequence_output) + + sequence_output_mean = sequence_output.mean(dim=1) + logits = self.classifier(sequence_output_mean) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.config.num_labels == 1: + self.config.problem_type = "regression" + elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.config.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + + +__all__ = ["CvtForImageClassification", "CvtModel", "CvtPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_tf_cvt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_tf_cvt.py new file mode 100644 index 0000000000000000000000000000000000000000..9239e1918eeca7588e0a978af4ef524380cd3f5f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_tf_cvt.py @@ -0,0 +1,1095 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 Cvt model.""" + +from __future__ import annotations + +import collections.abc +from dataclasses import dataclass + +import tensorflow as tf + +from ...modeling_tf_outputs import TFImageClassifierOutputWithNoAttention +from ...modeling_tf_utils import ( + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import shape_list, stable_softmax +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_cvt import CvtConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "CvtConfig" + + +@dataclass +class TFBaseModelOutputWithCLSToken(ModelOutput): + """ + Base class for model's outputs. + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`): + Classification token at the output of the last layer of the model. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus + the initial embedding outputs. + """ + + last_hidden_state: tf.Tensor | None = None + cls_token_value: tf.Tensor | None = None + hidden_states: tuple[tf.Tensor, ...] | None = None + + +class TFCvtDropPath(keras.layers.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + References: + (1) github.com:rwightman/pytorch-image-models + """ + + def __init__(self, drop_prob: float, **kwargs): + super().__init__(**kwargs) + self.drop_prob = drop_prob + + def call(self, x: tf.Tensor, training=None): + if self.drop_prob == 0.0 or not training: + return x + keep_prob = 1 - self.drop_prob + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1, dtype=self.compute_dtype) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + + +class TFCvtEmbeddings(keras.layers.Layer): + """Construct the Convolutional Token Embeddings.""" + + def __init__( + self, + config: CvtConfig, + patch_size: int, + num_channels: int, + embed_dim: int, + stride: int, + padding: int, + dropout_rate: float, + **kwargs, + ): + super().__init__(**kwargs) + self.convolution_embeddings = TFCvtConvEmbeddings( + config, + patch_size=patch_size, + num_channels=num_channels, + embed_dim=embed_dim, + stride=stride, + padding=padding, + name="convolution_embeddings", + ) + self.dropout = keras.layers.Dropout(dropout_rate) + + def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_state = self.convolution_embeddings(pixel_values) + hidden_state = self.dropout(hidden_state, training=training) + return hidden_state + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution_embeddings", None) is not None: + with tf.name_scope(self.convolution_embeddings.name): + self.convolution_embeddings.build(None) + + +class TFCvtConvEmbeddings(keras.layers.Layer): + """Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts.""" + + def __init__( + self, + config: CvtConfig, + patch_size: int, + num_channels: int, + embed_dim: int, + stride: int, + padding: int, + **kwargs, + ): + super().__init__(**kwargs) + self.padding = keras.layers.ZeroPadding2D(padding=padding) + self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + self.projection = keras.layers.Conv2D( + filters=embed_dim, + kernel_size=patch_size, + strides=stride, + padding="valid", + data_format="channels_last", + kernel_initializer=get_initializer(config.initializer_range), + name="projection", + ) + # Using the same default epsilon as PyTorch + self.normalization = keras.layers.LayerNormalization(epsilon=1e-5, name="normalization") + self.num_channels = num_channels + self.embed_dim = embed_dim + + def call(self, pixel_values: tf.Tensor) -> tf.Tensor: + if isinstance(pixel_values, dict): + pixel_values = pixel_values["pixel_values"] + + pixel_values = self.projection(self.padding(pixel_values)) + + # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels" + batch_size, height, width, num_channels = shape_list(pixel_values) + hidden_size = height * width + pixel_values = tf.reshape(pixel_values, shape=(batch_size, hidden_size, num_channels)) + pixel_values = self.normalization(pixel_values) + + # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels" + pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels)) + return pixel_values + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build([None, None, None, self.num_channels]) + if getattr(self, "normalization", None) is not None: + with tf.name_scope(self.normalization.name): + self.normalization.build([None, None, self.embed_dim]) + + +class TFCvtSelfAttentionConvProjection(keras.layers.Layer): + """Convolutional projection layer.""" + + def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs): + super().__init__(**kwargs) + self.padding = keras.layers.ZeroPadding2D(padding=padding) + self.convolution = keras.layers.Conv2D( + filters=embed_dim, + kernel_size=kernel_size, + kernel_initializer=get_initializer(config.initializer_range), + padding="valid", + strides=stride, + use_bias=False, + name="convolution", + groups=embed_dim, + ) + # Using the same default epsilon as PyTorch, TF uses (1 - pytorch momentum) + self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization") + self.embed_dim = embed_dim + + def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_state = self.convolution(self.padding(hidden_state)) + hidden_state = self.normalization(hidden_state, training=training) + return hidden_state + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution", None) is not None: + with tf.name_scope(self.convolution.name): + self.convolution.build([None, None, None, self.embed_dim]) + if getattr(self, "normalization", None) is not None: + with tf.name_scope(self.normalization.name): + self.normalization.build([None, None, None, self.embed_dim]) + + +class TFCvtSelfAttentionLinearProjection(keras.layers.Layer): + """Linear projection layer used to flatten tokens into 1D.""" + + def call(self, hidden_state: tf.Tensor) -> tf.Tensor: + # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels" + batch_size, height, width, num_channels = shape_list(hidden_state) + hidden_size = height * width + hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels)) + return hidden_state + + +class TFCvtSelfAttentionProjection(keras.layers.Layer): + """Convolutional Projection for Attention.""" + + def __init__( + self, + config: CvtConfig, + embed_dim: int, + kernel_size: int, + stride: int, + padding: int, + projection_method: str = "dw_bn", + **kwargs, + ): + super().__init__(**kwargs) + if projection_method == "dw_bn": + self.convolution_projection = TFCvtSelfAttentionConvProjection( + config, embed_dim, kernel_size, stride, padding, name="convolution_projection" + ) + self.linear_projection = TFCvtSelfAttentionLinearProjection() + + def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_state = self.convolution_projection(hidden_state, training=training) + hidden_state = self.linear_projection(hidden_state) + return hidden_state + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution_projection", None) is not None: + with tf.name_scope(self.convolution_projection.name): + self.convolution_projection.build(None) + + +class TFCvtSelfAttention(keras.layers.Layer): + """ + Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection), is applied for + query, key, and value embeddings. + """ + + def __init__( + self, + config: CvtConfig, + num_heads: int, + embed_dim: int, + kernel_size: int, + stride_q: int, + stride_kv: int, + padding_q: int, + padding_kv: int, + qkv_projection_method: str, + qkv_bias: bool, + attention_drop_rate: float, + with_cls_token: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.scale = embed_dim**-0.5 + self.with_cls_token = with_cls_token + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.convolution_projection_query = TFCvtSelfAttentionProjection( + config, + embed_dim, + kernel_size, + stride_q, + padding_q, + projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method, + name="convolution_projection_query", + ) + self.convolution_projection_key = TFCvtSelfAttentionProjection( + config, + embed_dim, + kernel_size, + stride_kv, + padding_kv, + projection_method=qkv_projection_method, + name="convolution_projection_key", + ) + self.convolution_projection_value = TFCvtSelfAttentionProjection( + config, + embed_dim, + kernel_size, + stride_kv, + padding_kv, + projection_method=qkv_projection_method, + name="convolution_projection_value", + ) + + self.projection_query = keras.layers.Dense( + units=embed_dim, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=qkv_bias, + bias_initializer="zeros", + name="projection_query", + ) + self.projection_key = keras.layers.Dense( + units=embed_dim, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=qkv_bias, + bias_initializer="zeros", + name="projection_key", + ) + self.projection_value = keras.layers.Dense( + units=embed_dim, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=qkv_bias, + bias_initializer="zeros", + name="projection_value", + ) + self.dropout = keras.layers.Dropout(attention_drop_rate) + + def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor: + batch_size, hidden_size, _ = shape_list(hidden_state) + head_dim = self.embed_dim // self.num_heads + hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, self.num_heads, head_dim)) + hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3)) + return hidden_state + + def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor: + if self.with_cls_token: + cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1) + + # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels" + batch_size, hidden_size, num_channels = shape_list(hidden_state) + hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels)) + + key = self.convolution_projection_key(hidden_state, training=training) + query = self.convolution_projection_query(hidden_state, training=training) + value = self.convolution_projection_value(hidden_state, training=training) + + if self.with_cls_token: + query = tf.concat((cls_token, query), axis=1) + key = tf.concat((cls_token, key), axis=1) + value = tf.concat((cls_token, value), axis=1) + + head_dim = self.embed_dim // self.num_heads + + query = self.rearrange_for_multi_head_attention(self.projection_query(query)) + key = self.rearrange_for_multi_head_attention(self.projection_key(key)) + value = self.rearrange_for_multi_head_attention(self.projection_value(value)) + + attention_score = tf.matmul(query, key, transpose_b=True) * self.scale + attention_probs = stable_softmax(logits=attention_score, axis=-1) + attention_probs = self.dropout(attention_probs, training=training) + + context = tf.matmul(attention_probs, value) + # "batch_size, num_heads, hidden_size, head_dim -> batch_size, hidden_size, (num_heads*head_dim)" + _, _, hidden_size, _ = shape_list(context) + context = tf.transpose(context, perm=(0, 2, 1, 3)) + context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim)) + return context + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution_projection_query", None) is not None: + with tf.name_scope(self.convolution_projection_query.name): + self.convolution_projection_query.build(None) + if getattr(self, "convolution_projection_key", None) is not None: + with tf.name_scope(self.convolution_projection_key.name): + self.convolution_projection_key.build(None) + if getattr(self, "convolution_projection_value", None) is not None: + with tf.name_scope(self.convolution_projection_value.name): + self.convolution_projection_value.build(None) + if getattr(self, "projection_query", None) is not None: + with tf.name_scope(self.projection_query.name): + self.projection_query.build([None, None, self.embed_dim]) + if getattr(self, "projection_key", None) is not None: + with tf.name_scope(self.projection_key.name): + self.projection_key.build([None, None, self.embed_dim]) + if getattr(self, "projection_value", None) is not None: + with tf.name_scope(self.projection_value.name): + self.projection_value.build([None, None, self.embed_dim]) + + +class TFCvtSelfOutput(keras.layers.Layer): + """Output of the Attention layer .""" + + def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs): + super().__init__(**kwargs) + self.dense = keras.layers.Dense( + units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.dropout = keras.layers.Dropout(drop_rate) + self.embed_dim = embed_dim + + def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_state = self.dense(inputs=hidden_state) + hidden_state = self.dropout(inputs=hidden_state, training=training) + return hidden_state + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.embed_dim]) + + +class TFCvtAttention(keras.layers.Layer): + """Attention layer. First chunk of the convolutional transformer block.""" + + def __init__( + self, + config: CvtConfig, + num_heads: int, + embed_dim: int, + kernel_size: int, + stride_q: int, + stride_kv: int, + padding_q: int, + padding_kv: int, + qkv_projection_method: str, + qkv_bias: bool, + attention_drop_rate: float, + drop_rate: float, + with_cls_token: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.attention = TFCvtSelfAttention( + config, + num_heads, + embed_dim, + kernel_size, + stride_q, + stride_kv, + padding_q, + padding_kv, + qkv_projection_method, + qkv_bias, + attention_drop_rate, + with_cls_token, + name="attention", + ) + self.dense_output = TFCvtSelfOutput(config, embed_dim, drop_rate, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False): + self_output = self.attention(hidden_state, height, width, training=training) + attention_output = self.dense_output(self_output, training=training) + return attention_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + + +class TFCvtIntermediate(keras.layers.Layer): + """Intermediate dense layer. Second chunk of the convolutional transformer block.""" + + def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs): + super().__init__(**kwargs) + self.dense = keras.layers.Dense( + units=int(embed_dim * mlp_ratio), + kernel_initializer=get_initializer(config.initializer_range), + activation="gelu", + name="dense", + ) + self.embed_dim = embed_dim + + def call(self, hidden_state: tf.Tensor) -> tf.Tensor: + hidden_state = self.dense(hidden_state) + return hidden_state + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.embed_dim]) + + +class TFCvtOutput(keras.layers.Layer): + """ + Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection. + """ + + def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: int, **kwargs): + super().__init__(**kwargs) + self.dense = keras.layers.Dense( + units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.dropout = keras.layers.Dropout(drop_rate) + self.embed_dim = embed_dim + self.mlp_ratio = mlp_ratio + + def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_state = self.dense(inputs=hidden_state) + hidden_state = self.dropout(inputs=hidden_state, training=training) + hidden_state = hidden_state + input_tensor + return hidden_state + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)]) + + +class TFCvtLayer(keras.layers.Layer): + """ + Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps). It + consists of 3 chunks : an attention layer, an intermediate dense layer and an output layer. This corresponds to the + `Block` class in the original implementation. + """ + + def __init__( + self, + config: CvtConfig, + num_heads: int, + embed_dim: int, + kernel_size: int, + stride_q: int, + stride_kv: int, + padding_q: int, + padding_kv: int, + qkv_projection_method: str, + qkv_bias: bool, + attention_drop_rate: float, + drop_rate: float, + mlp_ratio: float, + drop_path_rate: float, + with_cls_token: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.attention = TFCvtAttention( + config, + num_heads, + embed_dim, + kernel_size, + stride_q, + stride_kv, + padding_q, + padding_kv, + qkv_projection_method, + qkv_bias, + attention_drop_rate, + drop_rate, + with_cls_token, + name="attention", + ) + self.intermediate = TFCvtIntermediate(config, embed_dim, mlp_ratio, name="intermediate") + self.dense_output = TFCvtOutput(config, embed_dim, mlp_ratio, drop_rate, name="output") + # Using `layers.Activation` instead of `tf.identity` to better control `training` behaviour. + self.drop_path = ( + TFCvtDropPath(drop_path_rate, name="drop_path") + if drop_path_rate > 0.0 + else keras.layers.Activation("linear", name="drop_path") + ) + # Using the same default epsilon as PyTorch + self.layernorm_before = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before") + self.layernorm_after = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after") + self.embed_dim = embed_dim + + def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor: + # in Cvt, layernorm is applied before self-attention + attention_output = self.attention(self.layernorm_before(hidden_state), height, width, training=training) + attention_output = self.drop_path(attention_output, training=training) + + # first residual connection + hidden_state = attention_output + hidden_state + + # in Cvt, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_state) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.dense_output(layer_output, hidden_state) + layer_output = self.drop_path(layer_output, training=training) + return layer_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + if getattr(self, "drop_path", None) is not None: + with tf.name_scope(self.drop_path.name): + self.drop_path.build(None) + if getattr(self, "layernorm_before", None) is not None: + with tf.name_scope(self.layernorm_before.name): + self.layernorm_before.build([None, None, self.embed_dim]) + if getattr(self, "layernorm_after", None) is not None: + with tf.name_scope(self.layernorm_after.name): + self.layernorm_after.build([None, None, self.embed_dim]) + + +class TFCvtStage(keras.layers.Layer): + """ + Cvt stage (encoder block). Each stage has 2 parts : + - (1) A Convolutional Token Embedding layer + - (2) A Convolutional Transformer Block (layer). + The classification token is added only in the last stage. + + Args: + config ([`CvtConfig`]): Model configuration class. + stage (`int`): Stage number. + """ + + def __init__(self, config: CvtConfig, stage: int, **kwargs): + super().__init__(**kwargs) + self.config = config + self.stage = stage + if self.config.cls_token[self.stage]: + self.cls_token = self.add_weight( + shape=(1, 1, self.config.embed_dim[-1]), + initializer=get_initializer(self.config.initializer_range), + trainable=True, + name="cvt.encoder.stages.2.cls_token", + ) + + self.embedding = TFCvtEmbeddings( + self.config, + patch_size=config.patch_sizes[self.stage], + num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1], + stride=config.patch_stride[self.stage], + embed_dim=config.embed_dim[self.stage], + padding=config.patch_padding[self.stage], + dropout_rate=config.drop_rate[self.stage], + name="embedding", + ) + + drop_path_rates = tf.linspace(0.0, config.drop_path_rate[self.stage], config.depth[stage]) + drop_path_rates = [x.numpy().item() for x in drop_path_rates] + self.layers = [ + TFCvtLayer( + config, + num_heads=config.num_heads[self.stage], + embed_dim=config.embed_dim[self.stage], + kernel_size=config.kernel_qkv[self.stage], + stride_q=config.stride_q[self.stage], + stride_kv=config.stride_kv[self.stage], + padding_q=config.padding_q[self.stage], + padding_kv=config.padding_kv[self.stage], + qkv_projection_method=config.qkv_projection_method[self.stage], + qkv_bias=config.qkv_bias[self.stage], + attention_drop_rate=config.attention_drop_rate[self.stage], + drop_rate=config.drop_rate[self.stage], + mlp_ratio=config.mlp_ratio[self.stage], + drop_path_rate=drop_path_rates[self.stage], + with_cls_token=config.cls_token[self.stage], + name=f"layers.{j}", + ) + for j in range(config.depth[self.stage]) + ] + + def call(self, hidden_state: tf.Tensor, training: bool = False): + cls_token = None + hidden_state = self.embedding(hidden_state, training) + + # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels" + batch_size, height, width, num_channels = shape_list(hidden_state) + hidden_size = height * width + hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels)) + + if self.config.cls_token[self.stage]: + cls_token = tf.repeat(self.cls_token, repeats=batch_size, axis=0) + hidden_state = tf.concat((cls_token, hidden_state), axis=1) + + for layer in self.layers: + layer_outputs = layer(hidden_state, height, width, training=training) + hidden_state = layer_outputs + + if self.config.cls_token[self.stage]: + cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1) + + # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels" + hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels)) + return hidden_state, cls_token + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embedding", None) is not None: + with tf.name_scope(self.embedding.name): + self.embedding.build(None) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFCvtEncoder(keras.layers.Layer): + """ + Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers + (depth) being 1, 2 and 10. + + Args: + config ([`CvtConfig`]): Model configuration class. + """ + + config_class = CvtConfig + + def __init__(self, config: CvtConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.stages = [ + TFCvtStage(config, stage_idx, name=f"stages.{stage_idx}") for stage_idx in range(len(config.depth)) + ] + + def call( + self, + pixel_values: TFModelInputType, + output_hidden_states: bool | None = False, + return_dict: bool | None = True, + training: bool | None = False, + ) -> TFBaseModelOutputWithCLSToken | tuple[tf.Tensor]: + all_hidden_states = () if output_hidden_states else None + hidden_state = pixel_values + # When running on CPU, `keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width) + # as input format. So change the input format to (batch_size, height, width, num_channels). + hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1)) + + cls_token = None + for _, (stage_module) in enumerate(self.stages): + hidden_state, cls_token = stage_module(hidden_state, training=training) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + # Change back to (batch_size, num_channels, height, width) format to have uniformity in the modules + hidden_state = tf.transpose(hidden_state, perm=(0, 3, 1, 2)) + if output_hidden_states: + all_hidden_states = tuple(tf.transpose(hs, perm=(0, 3, 1, 2)) for hs in all_hidden_states) + + if not return_dict: + return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None) + + return TFBaseModelOutputWithCLSToken( + last_hidden_state=hidden_state, + cls_token_value=cls_token, + hidden_states=all_hidden_states, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "stages", None) is not None: + for layer in self.stages: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFCvtMainLayer(keras.layers.Layer): + """Construct the Cvt model.""" + + config_class = CvtConfig + + def __init__(self, config: CvtConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.encoder = TFCvtEncoder(config, name="encoder") + + @unpack_inputs + def call( + self, + pixel_values: TFModelInputType | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFBaseModelOutputWithCLSToken | tuple[tf.Tensor]: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + encoder_outputs = self.encoder( + pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return TFBaseModelOutputWithCLSToken( + last_hidden_state=sequence_output, + cls_token_value=encoder_outputs.cls_token_value, + hidden_states=encoder_outputs.hidden_states, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + + +class TFCvtPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CvtConfig + base_model_prefix = "cvt" + main_input_name = "pixel_values" + + +TFCVT_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using [`keras.Model.fit`] method which currently requires having all the + tensors in the first argument of the model call function: `model(inputs)`. + + + + Args: + config ([`CvtConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +TFCVT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`] + for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False``): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.", + TFCVT_START_DOCSTRING, +) +class TFCvtModel(TFCvtPreTrainedModel): + def __init__(self, config: CvtConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.cvt = TFCvtMainLayer(config, name="cvt") + + @unpack_inputs + @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBaseModelOutputWithCLSToken, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: tf.Tensor | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFBaseModelOutputWithCLSToken | tuple[tf.Tensor]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, TFCvtModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13") + >>> model = TFCvtModel.from_pretrained("microsoft/cvt-13") + + >>> inputs = image_processor(images=image, return_tensors="tf") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + outputs = self.cvt( + pixel_values=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + if not return_dict: + return (outputs[0],) + outputs[1:] + + return TFBaseModelOutputWithCLSToken( + last_hidden_state=outputs.last_hidden_state, + cls_token_value=outputs.cls_token_value, + hidden_states=outputs.hidden_states, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "cvt", None) is not None: + with tf.name_scope(self.cvt.name): + self.cvt.build(None) + + +@add_start_docstrings( + """ + Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. + """, + TFCVT_START_DOCSTRING, +) +class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: CvtConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.cvt = TFCvtMainLayer(config, name="cvt") + # Using same default epsilon as in the original implementation. + self.layernorm = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm") + + # Classifier head + self.classifier = keras.layers.Dense( + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + use_bias=True, + bias_initializer="zeros", + name="classifier", + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFImageClassifierOutputWithNoAttention, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: tf.Tensor | None = None, + labels: tf.Tensor | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFImageClassifierOutputWithNoAttention | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, TFCvtForImageClassification + >>> import tensorflow as tf + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13") + >>> model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13") + + >>> inputs = image_processor(images=image, return_tensors="tf") + >>> outputs = model(**inputs) + >>> logits = outputs.logits + >>> # model predicts one of the 1000 ImageNet classes + >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0] + >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)]) + ```""" + + outputs = self.cvt( + pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + cls_token = outputs[1] + if self.config.cls_token[-1]: + sequence_output = self.layernorm(cls_token) + else: + # rearrange "batch_size, num_channels, height, width -> batch_size, (height*width), num_channels" + batch_size, num_channels, height, width = shape_list(sequence_output) + sequence_output = tf.reshape(sequence_output, shape=(batch_size, num_channels, height * width)) + sequence_output = tf.transpose(sequence_output, perm=(0, 2, 1)) + sequence_output = self.layernorm(sequence_output) + + sequence_output_mean = tf.reduce_mean(sequence_output, axis=1) + logits = self.classifier(sequence_output_mean) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "cvt", None) is not None: + with tf.name_scope(self.cvt.name): + self.cvt.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.embed_dim[-1]]) + if getattr(self, "classifier", None) is not None: + if hasattr(self.classifier, "name"): + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.embed_dim[-1]]) + + +__all__ = ["TFCvtForImageClassification", "TFCvtModel", "TFCvtPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..40f84ccb59be91ad440ba413366dd726e555ca26 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dac import * + from .feature_extraction_dac import * + from .modeling_dac import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/configuration_dac.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/configuration_dac.py new file mode 100644 index 0000000000000000000000000000000000000000..bdf07cef45bde6a6064d4b6b02eb69f4f077e018 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/configuration_dac.py @@ -0,0 +1,114 @@ +# coding=utf-8 +# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Dac model configuration""" + +import math + +import numpy as np + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DacConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`DacModel`]. It is used to instantiate a + Dac model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the + [descript/dac_16khz](https://huggingface.co/descript/dac_16khz) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + encoder_hidden_size (`int`, *optional*, defaults to 64): + Intermediate representation dimension for the encoder. + downsampling_ratios (`list[int]`, *optional*, defaults to `[2, 4, 8, 8]`): + Ratios for downsampling in the encoder. These are used in reverse order for upsampling in the decoder. + decoder_hidden_size (`int`, *optional*, defaults to 1536): + Intermediate representation dimension for the decoder. + n_codebooks (`int`, *optional*, defaults to 9): + Number of codebooks in the VQVAE. + codebook_size (`int`, *optional*, defaults to 1024): + Number of discrete codes in each codebook. + codebook_dim (`int`, *optional*, defaults to 8): + Dimension of the codebook vectors. If not defined, uses `encoder_hidden_size`. + quantizer_dropout (`bool`, *optional*, defaults to 0): + Whether to apply dropout to the quantizer. + commitment_loss_weight (float, *optional*, defaults to 0.25): + Weight of the commitment loss term in the VQVAE loss function. + codebook_loss_weight (float, *optional*, defaults to 1.0): + Weight of the codebook loss term in the VQVAE loss function. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz). + Example: + + ```python + >>> from transformers import DacModel, DacConfig + + >>> # Initializing a "descript/dac_16khz" style configuration + >>> configuration = DacConfig() + + >>> # Initializing a model (with random weights) from the "descript/dac_16khz" style configuration + >>> model = DacModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "dac" + + def __init__( + self, + encoder_hidden_size=64, + downsampling_ratios=[2, 4, 8, 8], + decoder_hidden_size=1536, + n_codebooks=9, + codebook_size=1024, + codebook_dim=8, + quantizer_dropout=0, + commitment_loss_weight=0.25, + codebook_loss_weight=1.0, + sampling_rate=16000, + **kwargs, + ): + self.encoder_hidden_size = encoder_hidden_size + self.downsampling_ratios = downsampling_ratios + self.decoder_hidden_size = decoder_hidden_size + self.upsampling_ratios = downsampling_ratios[::-1] + self.n_codebooks = n_codebooks + self.codebook_size = codebook_size + self.codebook_dim = codebook_dim + self.quantizer_dropout = quantizer_dropout + self.sampling_rate = sampling_rate + + self.hidden_size = encoder_hidden_size * (2 ** len(downsampling_ratios)) + + self.hop_length = int(np.prod(downsampling_ratios)) + self.commitment_loss_weight = commitment_loss_weight + self.codebook_loss_weight = codebook_loss_weight + + super().__init__(**kwargs) + + @property + def frame_rate(self) -> int: + hop_length = np.prod(self.upsampling_ratios) + return math.ceil(self.sampling_rate / hop_length) + + +__all__ = ["DacConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/feature_extraction_dac.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/feature_extraction_dac.py new file mode 100644 index 0000000000000000000000000000000000000000..e81a2466dd91faa91ad9a34590825aa55e9497db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/feature_extraction_dac.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for DAC""" + +from typing import Optional, Union + +import numpy as np + +from ...feature_extraction_sequence_utils import SequenceFeatureExtractor +from ...feature_extraction_utils import BatchFeature +from ...utils import PaddingStrategy, TensorType, logging + + +logger = logging.get_logger(__name__) + + +class DacFeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs an Dac feature extractor. + + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. + + Args: + feature_size (`int`, *optional*, defaults to 1): + The feature dimension of the extracted features. Use 1 for mono, 2 for stereo. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz). + padding_value (`float`, *optional*, defaults to 0.0): + The value that is used for padding. + hop_length (`int`, *optional*, defaults to 512): + Overlap length between successive windows. + """ + + model_input_names = ["input_values", "n_quantizers"] + + def __init__( + self, + feature_size: int = 1, + sampling_rate: int = 16000, + padding_value: float = 0.0, + hop_length: int = 512, + **kwargs, + ): + super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) + self.hop_length = hop_length + + def __call__( + self, + raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]], + padding: Optional[Union[bool, str, PaddingStrategy]] = None, + truncation: Optional[bool] = False, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + sampling_rate: Optional[int] = None, + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). + + Args: + raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`): + The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape + `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio + (`feature_size = 2`). + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, *optional*, defaults to `False`): + Activates truncation to cut input sequences longer than `max_length` to `max_length`. + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + return_tensors (`str` or [`~utils.TensorType`], *optional*, default to 'pt'): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + sampling_rate (`int`, *optional*): + The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors. + """ + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of" + f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with" + f" {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. " + "Failing to do so can result in silent errors that might be hard to debug." + ) + + if padding and truncation: + raise ValueError("Both padding and truncation were set. Make sure you only set one.") + elif padding is None: + # by default let's pad the inputs + padding = True + + is_batched = bool( + isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list))) + ) + + if is_batched: + raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio] + elif not is_batched and not isinstance(raw_audio, np.ndarray): + raw_audio = np.asarray(raw_audio, dtype=np.float32) + elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64): + raw_audio = raw_audio.astype(np.float32) + + # always return batch + if not is_batched: + raw_audio = [np.asarray(raw_audio).T] + + # verify inputs are valid + for idx, example in enumerate(raw_audio): + if example.ndim > 2: + raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}") + if self.feature_size == 1 and example.ndim != 1: + raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels") + if self.feature_size == 2: + raise ValueError("Stereo audio isn't supported for now") + + input_values = BatchFeature({"input_values": raw_audio}) + + # normal padding on batch + padded_inputs = self.pad( + input_values, + max_length=max_length, + truncation=truncation, + padding=padding, + return_attention_mask=padding, + pad_to_multiple_of=self.hop_length, + ) + if padding: + padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask") + if padding: + padded_inputs.input_values = padded_inputs.input_values[:, np.newaxis, :] + + input_values = [] + for example in padded_inputs.pop("input_values"): + if self.feature_size == 1: + example = example[..., None] + input_values.append(example.T) + + padded_inputs["input_values"] = input_values + if return_tensors is not None: + padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + + return padded_inputs + + +__all__ = ["DacFeatureExtractor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/modeling_dac.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/modeling_dac.py new file mode 100644 index 0000000000000000000000000000000000000000..e97c8183651e1fa45d0c0886e258aa564b701239 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/modeling_dac.py @@ -0,0 +1,685 @@ +# coding=utf-8 +# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Transformers DAC model.""" + +import math +from dataclasses import dataclass +from typing import Optional + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ...modeling_utils import PreTrainedAudioTokenizerBase +from ...utils import ModelOutput, auto_docstring +from .configuration_dac import DacConfig + + +@dataclass +@auto_docstring +class DacOutput(ModelOutput): + r""" + loss (`torch.Tensor`): + Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses. + audio_values (`torch.Tensor` of shape `(batch_size, input_length)`): + Reconstructed audio data. + quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`): + Quantized continuous representation of input. + audio_codes (`torch.LongTensor` of shape `(batch_size, num_codebooks, time_steps)`): + Codebook indices for each codebook (quantized discrete representation of input). + projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`): + Projected latents (continuous representation of input before quantization). + """ + + loss: Optional[torch.FloatTensor] = None + audio_values: Optional[torch.FloatTensor] = None + quantized_representation: Optional[torch.FloatTensor] = None + audio_codes: Optional[torch.LongTensor] = None + projected_latents: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring +class DacEncoderOutput(ModelOutput): + r""" + loss (`torch.Tensor`): + Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses. + quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`, *optional*): + Quantized continuous representation of input. + audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*): + Codebook indices for each codebook (quantized discrete representation of input). + projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`, *optional*): + Projected latents (continuous representation of input before quantization). + """ + + loss: Optional[torch.FloatTensor] = None + quantized_representation: Optional[torch.FloatTensor] = None + audio_codes: Optional[torch.FloatTensor] = None + projected_latents: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring +# Copied from transformers.models.encodec.modeling_encodec.EncodecDecoderOutput with Encodec->Dac, segment_length->input_length +class DacDecoderOutput(ModelOutput): + r""" + audio_values (`torch.FloatTensor` of shape `(batch_size, input_length)`, *optional*): + Decoded audio values, obtained using the decoder part of Dac. + """ + + audio_values: Optional[torch.FloatTensor] = None + + +class Snake1d(nn.Module): + """ + A 1-dimensional Snake activation function module. + """ + + def __init__(self, hidden_dim): + super().__init__() + self.alpha = nn.Parameter(torch.ones(1, hidden_dim, 1)) + + def forward(self, hidden_states): + shape = hidden_states.shape + hidden_states = hidden_states.reshape(shape[0], shape[1], -1) + hidden_states = hidden_states + (self.alpha + 1e-9).reciprocal() * torch.sin(self.alpha * hidden_states).pow(2) + hidden_states = hidden_states.reshape(shape) + return hidden_states + + +class DacVectorQuantize(nn.Module): + """ + Implementation of VQ similar to Karpathy's repo (https://github.com/karpathy/deep-vector-quantization) + + Additionally uses following tricks from improved VQGAN + (https://huggingface.co/papers/2110.04627): + 1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space + for improved codebook usage + 2. l2-normalized codes: Converts euclidean distance to cosine similarity which + improves training stability + """ + + def __init__(self, config: DacConfig): + super().__init__() + + self.codebook_dim = config.codebook_dim + self.in_proj = nn.Conv1d(config.hidden_size, config.codebook_dim, kernel_size=1) + self.out_proj = nn.Conv1d(config.codebook_dim, config.hidden_size, kernel_size=1) + self.codebook = nn.Embedding(config.codebook_size, config.codebook_dim) + + def forward(self, hidden_state): + """ + Quantizes the input tensor using a fixed codebook and returns the corresponding codebook vectors. + + Args: + hidden_state (`torch.FloatTensor` of shape `(batch_size, dimension, time_steps)`): + Input tensor. + + Returns: + quantized_representation (`torch.Tensor`of shape `(batch_size, dimension, time_steps)`): + Quantized continuous representation of input. + commitment_loss (`torch.FloatTensor`of shape `(1)`): + Commitment loss to train encoder to predict vectors closer to codebook entries. + codebook_loss (`torch.FloatTensor`of shape `(1)`): + Codebook loss to update the codebook. + audio_codes (`torch.LongTensor` of shape `(batch_size, time_steps)`): + Codebook indices for each codebook, quantized discrete representation of input. + projected_latents (torch.FloatTensor of shape `(batch_size, num_codebooks * dimension, time_steps)`): + Projected latents (continuous representation of input before quantization). + """ + + projected_latents = self.in_proj(hidden_state) + quantized_representation, audio_codes = self.decode_latents(projected_latents) + + commitment_loss = F.mse_loss(projected_latents, quantized_representation.detach(), reduction="mean") + codebook_loss = F.mse_loss(quantized_representation, projected_latents.detach(), reduction="mean") + # noop in forward pass, straight-through gradient estimator in backward pass + quantized_representation = projected_latents + (quantized_representation - projected_latents).detach() + quantized_representation = self.out_proj(quantized_representation) + + return quantized_representation, commitment_loss, codebook_loss, audio_codes, projected_latents + + def decode_latents(self, hidden_states): + batch_size, hidden_dim, sequence_length = hidden_states.shape + encodings = hidden_states.permute(0, 2, 1).reshape(batch_size * sequence_length, hidden_dim) + codebook = self.codebook.weight # codebook: (N x D) + + # L2 normalize encodings and codebook (ViT-VQGAN) + encodings = F.normalize(encodings) + codebook = F.normalize(codebook) + + # Compute euclidean distance with codebook + l2_norm = encodings.pow(2).sum(1, keepdim=True) + dist = -(l2_norm - 2 * encodings @ codebook.t()) + codebook.pow(2).sum(1, keepdim=True).t() + + indices = dist.max(1)[1] + indices = indices.reshape(hidden_states.size(0), -1) + quantized_representation = self.codebook(indices).transpose(1, 2) + return quantized_representation, indices + + +class DacResidualUnit(nn.Module): + """ + A residual unit composed of Snake1d and weight-normalized Conv1d layers with dilations. + """ + + def __init__(self, dimension: int = 16, dilation: int = 1): + super().__init__() + pad = ((7 - 1) * dilation) // 2 + + self.snake1 = Snake1d(dimension) + self.conv1 = nn.Conv1d(dimension, dimension, kernel_size=7, dilation=dilation, padding=pad) + self.snake2 = Snake1d(dimension) + self.conv2 = nn.Conv1d(dimension, dimension, kernel_size=1) + + def forward(self, hidden_state): + """ + Forward pass through the residual unit. + + Args: + hidden_state (`torch.Tensor` of shape `(batch_size, channels, time_steps)`): + Input tensor . + + Returns: + output_tensor (`torch.Tensor` of shape `(batch_size, channels, time_steps)`): + Input tensor after passing through the residual unit. + """ + output_tensor = hidden_state + output_tensor = self.conv1(self.snake1(output_tensor)) + output_tensor = self.conv2(self.snake2(output_tensor)) + + padding = (hidden_state.shape[-1] - output_tensor.shape[-1]) // 2 + if padding > 0: + hidden_state = hidden_state[..., padding:-padding] + output_tensor = hidden_state + output_tensor + return output_tensor + + +class DacEncoderBlock(nn.Module): + """Encoder block used in DAC encoder.""" + + def __init__(self, config: DacConfig, stride: int = 1, stride_index: int = 1): + super().__init__() + + dimension = config.encoder_hidden_size * 2**stride_index + self.res_unit1 = DacResidualUnit(dimension // 2, dilation=1) + self.res_unit2 = DacResidualUnit(dimension // 2, dilation=3) + self.res_unit3 = DacResidualUnit(dimension // 2, dilation=9) + self.snake1 = Snake1d(dimension // 2) + self.conv1 = nn.Conv1d( + dimension // 2, dimension, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2) + ) + + def forward(self, hidden_state): + hidden_state = self.res_unit1(hidden_state) + hidden_state = self.res_unit2(hidden_state) + hidden_state = self.snake1(self.res_unit3(hidden_state)) + hidden_state = self.conv1(hidden_state) + + return hidden_state + + +class DacDecoderBlock(nn.Module): + """Decoder block used in DAC decoder.""" + + def __init__(self, config: DacConfig, stride: int = 1, stride_index: int = 1): + super().__init__() + + input_dim = config.decoder_hidden_size // 2**stride_index + output_dim = config.decoder_hidden_size // 2 ** (stride_index + 1) + self.snake1 = Snake1d(input_dim) + self.conv_t1 = nn.ConvTranspose1d( + input_dim, + output_dim, + kernel_size=2 * stride, + stride=stride, + padding=math.ceil(stride / 2), + ) + + self.res_unit1 = DacResidualUnit(output_dim, dilation=1) + self.res_unit2 = DacResidualUnit(output_dim, dilation=3) + self.res_unit3 = DacResidualUnit(output_dim, dilation=9) + + def forward(self, hidden_state): + hidden_state = self.snake1(hidden_state) + hidden_state = self.conv_t1(hidden_state) + hidden_state = self.res_unit1(hidden_state) + hidden_state = self.res_unit2(hidden_state) + hidden_state = self.res_unit3(hidden_state) + + return hidden_state + + +class DacResidualVectorQuantize(nn.Module): + """ + ResidualVectorQuantize block - Introduced in SoundStream: An end2end neural audio codec (https://huggingface.co/papers/2107.03312) + """ + + def __init__(self, config: DacConfig): + super().__init__() + + n_codebooks = config.n_codebooks + quantizer_dropout = config.quantizer_dropout + + self.n_codebooks = n_codebooks + + self.quantizers = nn.ModuleList([DacVectorQuantize(config) for i in range(config.n_codebooks)]) + self.quantizer_dropout = quantizer_dropout + + def forward(self, hidden_state, n_quantizers: Optional[int] = None): + """ + Quantizes the input tensor using a fixed set of codebooks and returns corresponding codebook vectors. + Args: + hidden_state (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`): + Input tensor to be quantized. + n_quantizers (`int`, *optional*): + Number of quantizers to use. If specified and `self.quantizer_dropout` is True, + this argument is ignored during training, and a random number of quantizers is used. + + Returns: + quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`): + Quantized continuous representation of input. + audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`): + Codebook indices for each codebook (quantized discrete representation of input). + projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`): + Projected latents (continuous representation of input before quantization). + commitment_loss (`torch.Tensor` of shape `(1)`): + Commitment loss to train the encoder to predict vectors closer to codebook entries. + codebook_loss (`torch.Tensor` of shape `(1)`): + Codebook loss to update the codebook. + """ + + quantized_representation = 0 + residual = hidden_state + commitment_loss = 0 + codebook_loss = 0 + + audio_codes = [] + projected_latents = [] + + n_quantizers = n_quantizers if n_quantizers is not None else self.n_codebooks + if self.training: + n_quantizers = torch.ones((hidden_state.shape[0],)) * self.n_codebooks + 1 + dropout = torch.randint(1, self.n_codebooks + 1, (hidden_state.shape[0],)) + n_dropout = int(hidden_state.shape[0] * self.quantizer_dropout) + n_quantizers[:n_dropout] = dropout[:n_dropout] + n_quantizers = n_quantizers.to(hidden_state.device) + + for i, quantizer in enumerate(self.quantizers): + if self.training is False and i >= n_quantizers: + break + + quantized_representation_i, commitment_loss_i, codebook_loss_i, indices_i, projected_latents_i = quantizer( + residual + ) + + # Create mask to apply quantizer dropout + mask = torch.full((hidden_state.shape[0],), fill_value=i, device=hidden_state.device) < n_quantizers + quantized_representation = quantized_representation + quantized_representation_i * mask[:, None, None] + residual = residual - quantized_representation_i + + # Sum losses + commitment_loss += commitment_loss_i * mask + codebook_loss += codebook_loss_i * mask + + audio_codes.append(indices_i) + projected_latents.append(projected_latents_i) + + audio_codes = torch.stack(audio_codes, dim=1) + projected_latents = torch.cat(projected_latents, dim=1) + + return quantized_representation, audio_codes, projected_latents, commitment_loss, codebook_loss + + def from_codes(self, audio_codes: torch.Tensor): + """ + Reconstructs the continuous representation from quantized codes. + + Args: + audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`): + Quantized discrete representation of input. + + Returns: + quantized_representation (`torch.Tensor`): + Quantized continuous representation of input. + projected_latents (`torch.Tensor`): + List of projected latents (continuous representations of input before quantization) + for each codebook. + audio_codes (`torch.Tensor`): + Codebook indices for each codebook. + """ + quantized_representation = 0.0 + projected_latents = [] + n_codebooks = audio_codes.shape[1] + for i in range(n_codebooks): + projected_latents_i = self.quantizers[i].codebook(audio_codes[:, i, :]).transpose(1, 2) + projected_latents.append(projected_latents_i) + quantized_representation += self.quantizers[i].out_proj(projected_latents_i) + return quantized_representation, torch.cat(projected_latents, dim=1), audio_codes + + def from_latents(self, latents: torch.Tensor): + """Reconstructs the quantized representation from unquantized latents. + + Args: + latents (`torch.Tensor` of shape `(batch_size, total_latent_dimension, time_steps)`): + Continuous representation of input after projection. + + Returns: + quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`): + Quantized representation of the full-projected space. + quantized_latents (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`): + Quantized representation of the latent space (continuous representation before quantization). + """ + quantized_representation = 0 + quantized_latents = [] + codes = [] + codebook_dims_tensor = torch.tensor([0] + [q.codebook_dim for q in self.quantizers]) + dims = torch.cumsum(codebook_dims_tensor, dim=0) + + n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[0] + for i in range(n_codebooks): + hidden_dim_j, hidden_dim_k = dims[i], dims[i + 1] + quantized_latents_i, codes_i = self.quantizers[i].decode_latents(latents[:, hidden_dim_j:hidden_dim_k, :]) + quantized_latents.append(quantized_latents_i) + codes.append(codes_i) + + quantized_representation_i = self.quantizers[i].out_proj(quantized_latents_i) + quantized_representation = quantized_representation + quantized_representation_i + + return quantized_representation, torch.cat(quantized_latents, dim=1) + + +class DacDecoder(nn.Module): + """DAC Decoder""" + + def __init__(self, config: DacConfig): + super().__init__() + + input_channel = config.hidden_size + channels = config.decoder_hidden_size + strides = config.upsampling_ratios + + # Add first conv layer + self.conv1 = nn.Conv1d(input_channel, channels, kernel_size=7, padding=3) + + # Add upsampling + MRF blocks + block = [] + for stride_index, stride in enumerate(strides): + block += [DacDecoderBlock(config, stride, stride_index)] + + self.block = nn.ModuleList(block) + output_dim = config.decoder_hidden_size // 2 ** (stride_index + 1) + self.snake1 = Snake1d(output_dim) + self.conv2 = nn.Conv1d(output_dim, 1, kernel_size=7, padding=3) + self.tanh = nn.Tanh() + + def forward(self, hidden_state): + hidden_state = self.conv1(hidden_state) + + for layer in self.block: + hidden_state = layer(hidden_state) + + hidden_state = self.snake1(hidden_state) + hidden_state = self.conv2(hidden_state) + hidden_state = self.tanh(hidden_state) + + return hidden_state + + +class DacEncoder(nn.Module): + """DAC Encoder""" + + def __init__(self, config: DacConfig): + super().__init__() + + strides = config.downsampling_ratios + # Create first convolution + self.conv1 = nn.Conv1d(1, config.encoder_hidden_size, kernel_size=7, padding=3) + + self.block = [] + # Create EncoderBlocks that double channels as they downsample by `stride` + for stride_index, stride in enumerate(strides): + stride_index = stride_index + 1 + self.block += [DacEncoderBlock(config, stride=stride, stride_index=stride_index)] + + self.block = nn.ModuleList(self.block) + d_model = config.encoder_hidden_size * 2**stride_index + self.snake1 = Snake1d(d_model) + self.conv2 = nn.Conv1d(d_model, config.hidden_size, kernel_size=3, padding=1) + + def forward(self, hidden_state): + hidden_state = self.conv1(hidden_state) + + for module in self.block: + hidden_state = module(hidden_state) + + hidden_state = self.snake1(hidden_state) + hidden_state = self.conv2(hidden_state) + + return hidden_state + + +@auto_docstring +class DacPreTrainedModel(PreTrainedAudioTokenizerBase): + config: DacConfig + base_model_prefix = "dac" + main_input_name = "input_values" + + def _init_weights(self, module): + if isinstance(module, nn.Conv1d): + nn.init.trunc_normal_(module.weight, std=0.02) + nn.init.constant_(module.bias, 0) + elif isinstance(module, Snake1d): + module.alpha.data.fill_(1.0) + elif isinstance(module, nn.ConvTranspose1d): + module.reset_parameters() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=0.02) + + def apply_weight_norm(self): + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + for layer in self.quantizer.quantizers: + weight_norm(layer.in_proj) + weight_norm(layer.out_proj) + + weight_norm(self.encoder.conv1) + weight_norm(self.encoder.conv2) + + for layer in self.encoder.block: + weight_norm(layer.conv1) + weight_norm(layer.res_unit1.conv1) + weight_norm(layer.res_unit1.conv2) + weight_norm(layer.res_unit2.conv1) + weight_norm(layer.res_unit2.conv2) + weight_norm(layer.res_unit3.conv1) + weight_norm(layer.res_unit3.conv2) + + weight_norm(self.decoder.conv1) + weight_norm(self.decoder.conv2) + + for layer in self.decoder.block: + weight_norm(layer.conv_t1) + weight_norm(layer.res_unit1.conv1) + weight_norm(layer.res_unit1.conv2) + weight_norm(layer.res_unit2.conv1) + weight_norm(layer.res_unit2.conv2) + weight_norm(layer.res_unit3.conv1) + weight_norm(layer.res_unit3.conv2) + + def remove_weight_norm(self): + for layer in self.quantizer.quantizers: + nn.utils.remove_weight_norm(layer.in_proj) + nn.utils.remove_weight_norm(layer.out_proj) + + nn.utils.remove_weight_norm(self.encoder.conv1) + nn.utils.remove_weight_norm(self.encoder.conv2) + + for layer in self.encoder.block: + nn.utils.remove_weight_norm(layer.conv1) + nn.utils.remove_weight_norm(layer.res_unit1.conv1) + nn.utils.remove_weight_norm(layer.res_unit1.conv2) + nn.utils.remove_weight_norm(layer.res_unit2.conv1) + nn.utils.remove_weight_norm(layer.res_unit2.conv2) + nn.utils.remove_weight_norm(layer.res_unit3.conv1) + nn.utils.remove_weight_norm(layer.res_unit3.conv2) + + nn.utils.remove_weight_norm(self.decoder.conv1) + nn.utils.remove_weight_norm(self.decoder.conv2) + + for layer in self.decoder.block: + nn.utils.remove_weight_norm(layer.conv_t1) + nn.utils.remove_weight_norm(layer.res_unit1.conv1) + nn.utils.remove_weight_norm(layer.res_unit1.conv2) + nn.utils.remove_weight_norm(layer.res_unit2.conv1) + nn.utils.remove_weight_norm(layer.res_unit2.conv2) + nn.utils.remove_weight_norm(layer.res_unit3.conv1) + nn.utils.remove_weight_norm(layer.res_unit3.conv2) + + +@auto_docstring( + custom_intro=""" + The DAC (Descript Audio Codec) model. + """ +) +class DacModel(DacPreTrainedModel): + def __init__(self, config: DacConfig): + super().__init__(config) + self.config = config + + self.encoder = DacEncoder(config) + self.decoder = DacDecoder(config) + + self.quantizer = DacResidualVectorQuantize(config) + + self.bits_per_codebook = int(math.log2(self.config.codebook_size)) + if 2**self.bits_per_codebook != self.config.codebook_size: + raise ValueError("The codebook_size must be a power of 2.") + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def encode( + self, + input_values: torch.Tensor, + n_quantizers: Optional[int] = None, + return_dict: Optional[bool] = None, + ): + r""" + input_values (`torch.Tensor of shape `(batch_size, 1, time_steps)`): + Input audio data to encode, + n_quantizers (int, *optional*): + Number of quantizers to use. If None, all quantizers are used. Default is None. + """ + return_dict = return_dict if return_dict is not None else self.config.return_dict + + quantized_representation = self.encoder(input_values) + quantized_representation, audio_codes, projected_latents, commitment_loss, codebook_loss = self.quantizer( + quantized_representation, n_quantizers + ) + + loss = self.config.commitment_loss_weight * commitment_loss + self.config.codebook_loss_weight * codebook_loss + + if not return_dict: + return (loss, quantized_representation, audio_codes, projected_latents) + + return DacEncoderOutput(loss, quantized_representation, audio_codes, projected_latents) + + @auto_docstring + def decode( + self, + quantized_representation: Optional[torch.Tensor] = None, + audio_codes: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = None, + ): + r""" + quantized_representation (torch.Tensor of shape `(batch_size, dimension, time_steps)`, *optional*): + Quantized continuous representation of input. + audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*): + The codebook indices for each codebook, representing the quantized discrete + representation of the input. This parameter should be provided if you want + to decode directly from the audio codes (it will overwrite quantized_representation). + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`DacDecoderOutput`] instead of a plain tuple. + """ + + if quantized_representation is None and audio_codes is None: + raise ValueError("Either `quantized_representation` or `audio_codes` must be provided.") + + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if audio_codes is not None: + quantized_representation = self.quantizer.from_codes(audio_codes)[0] + + audio_values = self.decoder(quantized_representation).squeeze(1) + + if not return_dict: + return (audio_values,) + + return DacDecoderOutput(audio_values) + + @auto_docstring + def forward( + self, + input_values: torch.Tensor, + n_quantizers: Optional[int] = None, + return_dict: Optional[bool] = None, + ): + r""" + input_values (`torch.Tensor` of shape `(batch_size, 1, time_steps)`): + Audio data to encode. + n_quantizers (`int`, *optional*): + Number of quantizers to use. If `None`, all quantizers are used. Default is `None`. + + Examples: + + ```python + >>> from datasets import load_dataset, Audio + >>> from transformers import DacModel, AutoProcessor + >>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + + >>> model = DacModel.from_pretrained("descript/dac_16khz") + >>> processor = AutoProcessor.from_pretrained("descript/dac_16khz") + >>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) + >>> audio_sample = librispeech_dummy[-1]["audio"]["array"] + >>> inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt") + + >>> encoder_outputs = model.encode(inputs["input_values"]) + >>> # Get the intermediate audio codes + >>> audio_codes = encoder_outputs.audio_codes + >>> # Reconstruct the audio from its quantized representation + >>> audio_values = model.decode(encoder_outputs.quantized_representation) + >>> # or the equivalent with a forward pass + >>> audio_values = model(inputs["input_values"]).audio_values + ```""" + + return_dict = return_dict if return_dict is not None else self.config.return_dict + length = input_values.shape[-1] + + loss, quantized_representation, audio_codes, projected_latents = self.encode( + input_values, n_quantizers, return_dict=False + ) + audio_values = self.decode(quantized_representation, return_dict=False)[0][..., :length] + + if not return_dict: + return (loss, audio_values, quantized_representation, audio_codes, projected_latents) + + return DacOutput(loss, audio_values, quantized_representation, audio_codes, projected_latents) + + +__all__ = ["DacModel", "DacPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19116604f739c69a26cc582f599768e8f111b14e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_audio.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6c6023c8671a89b448c4c5597d837e34bb3e71f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_audio.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb4536f0fa15ec414e51343ba2a86dbbace70974 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_vision.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_vision.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ba9dcb5920302549787ed75f3b9c68b9be8a0e3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_vision.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_audio.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97c3914b373f4dbc78c61ffb1f4bf458dfbd2f9c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_audio.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e393d306b2a881a4b9d615b4ffa7be3e73e1a0a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_vision.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_vision.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..71b52305aee18ed515fdbb1f631f7b3a1bea2368 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_vision.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_tf_data2vec_vision.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_tf_data2vec_vision.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f804e768e2b834fe1f10cc1c4dfe42160f1ac495 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_tf_data2vec_vision.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modular_data2vec_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modular_data2vec_audio.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09052665af1120ac0685fce96052a20b62ecf793 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modular_data2vec_audio.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72964d548b78905f10d2b729a7c29c67822ad7d6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/configuration_deberta_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/configuration_deberta_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99670af8103c2012aa728e436b41083dbb4124b2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/configuration_deberta_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/modeling_deberta_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/modeling_deberta_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54f555fad75eb9315ced68afe2308576812fe122 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/modeling_deberta_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/modeling_tf_deberta_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/modeling_tf_deberta_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30b3bee5865a5ef47584dc588b5bdbb07e8eb398 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/modeling_tf_deberta_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/tokenization_deberta_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/tokenization_deberta_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3897946546bf6d334b1fc99f72e3bc84120d6acb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/tokenization_deberta_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/tokenization_deberta_v2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/tokenization_deberta_v2_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b4b345e1468811044478395000885546da4c73e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/tokenization_deberta_v2_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/decision_transformer/configuration_decision_transformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/decision_transformer/configuration_decision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..436834c7e5e038b79d9cf6ef21e34ba5eaf6ae64 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/decision_transformer/configuration_decision_transformer.py @@ -0,0 +1,157 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Decision Transformer model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DecisionTransformerConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`DecisionTransformerModel`]. It is used to + instantiate a Decision Transformer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the standard + DecisionTransformer architecture. Many of the config options are used to instantiate the GPT2 model that is used as + part of the architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + state_dim (`int`, *optional*, defaults to 17): + The state size for the RL environment + act_dim (`int`, *optional*, defaults to 4): + The size of the output action space + hidden_size (`int`, *optional*, defaults to 128): + The size of the hidden layers + max_ep_len (`int`, *optional*, defaults to 4096): + The maximum length of an episode in the environment + action_tanh (`bool`, *optional*, defaults to True): + Whether to use a tanh activation on action prediction + vocab_size (`int`, *optional*, defaults to 50257): + Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`DecisionTransformerModel`]. + n_positions (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_layer (`int`, *optional*, defaults to 3): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 1): + Number of attention heads for each attention layer in the Transformer encoder. + n_inner (`int`, *optional*): + Dimensionality of the inner feed-forward layers. If unset, will default to 4 times `n_embd`. + activation_function (`str`, *optional*, defaults to `"gelu"`): + Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`int`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + scale_attn_weights (`bool`, *optional*, defaults to `True`): + Scale attention weights by dividing by sqrt(hidden_size).. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): + Whether to additionally scale attention weights by `1 / layer_idx + 1`. + reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): + Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention + dot-product/softmax to float() when training with mixed precision. + + Example: + + ```python + >>> from transformers import DecisionTransformerConfig, DecisionTransformerModel + + >>> # Initializing a DecisionTransformer configuration + >>> configuration = DecisionTransformerConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = DecisionTransformerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "decision_transformer" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + state_dim=17, + act_dim=4, + hidden_size=128, + max_ep_len=4096, + action_tanh=True, + vocab_size=1, + n_positions=1024, + n_layer=3, + n_head=1, + n_inner=None, + activation_function="relu", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + scale_attn_weights=True, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + scale_attn_by_inverse_layer_idx=False, + reorder_and_upcast_attn=False, + **kwargs, + ): + self.state_dim = state_dim + self.act_dim = act_dim + self.hidden_size = hidden_size + self.max_ep_len = max_ep_len + self.action_tanh = action_tanh + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx + self.reorder_and_upcast_attn = reorder_and_upcast_attn + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + +__all__ = ["DecisionTransformerConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/decision_transformer/modeling_decision_transformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/decision_transformer/modeling_decision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f9c68fcbdeaeca9fb89c160de8f440333befd5fd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -0,0 +1,934 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch DecisionTransformer model.""" + +import math +import os +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer +from ...utils import ( + ModelOutput, + auto_docstring, + logging, +) +from ...utils.deprecation import deprecate_kwarg +from .configuration_decision_transformer import DecisionTransformerConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.gpt2.modeling_gpt2.load_tf_weights_in_gpt2 +def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): + """Load tf checkpoints in a pytorch model""" + try: + import re + + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(gpt2_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array.squeeze()) + + for name, array in zip(names, arrays): + name = name[6:] # skip "model/" + name = name.split("/") + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+\d+", m_name): + scope_names = re.split(r"(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "w" or scope_names[0] == "g": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "b": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "wpe" or scope_names[0] == "wte": + pointer = getattr(pointer, scope_names[0]) + pointer = getattr(pointer, "weight") + else: + pointer = getattr(pointer, scope_names[0]) + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except ValueError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +# Copied from transformers.models.gpt2.modeling_gpt2.eager_attention_forward +def eager_attention_forward(module, query, key, value, attention_mask, head_mask=None, **kwargs): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + + if module.scale_attn_weights: + attn_weights = attn_weights / torch.full( + [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device + ) + + # Layer-wise attention scaling + if module.scale_attn_by_inverse_layer_idx: + attn_weights = attn_weights / float(module.layer_idx + 1) + + if not module.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = module.bias[:, :, key_length - query_length : key_length, :key_length] + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value) + + if attention_mask is not None: + # Apply the attention mask + causal_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise + attn_weights = attn_weights.type(value.dtype) + attn_weights = module.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2) + + return attn_output, attn_weights + + +# Copied from transformers.models.gpt2.modeling_gpt2.GPT2Attention with GPT2->DecisionTransformerGPT2 +class DecisionTransformerGPT2Attention(nn.Module): + def __init__(self, config, is_cross_attention=False, layer_idx=None): + super().__init__() + self.config = config + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( + 1, 1, max_positions, max_positions + ), + persistent=False, + ) + self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) + + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.split_size = self.embed_dim + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + + self.scale_attn_weights = config.scale_attn_weights + self.is_cross_attention = is_cross_attention + + # Layer-wise attention scaling, reordering, and upcasting + self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx + self.layer_idx = layer_idx + self.reorder_and_upcast_attn = config.reorder_and_upcast_attn + + if self.is_cross_attention: + self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) + self.q_attn = Conv1D(self.embed_dim, self.embed_dim) + else: + self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + self.c_proj = Conv1D(self.embed_dim, self.embed_dim) + + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + self.is_causal = True + + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) + + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + + # Update hyper params + self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads)) + self.num_heads = self.num_heads - len(heads) + self.pruned_heads = self.pruned_heads.union(heads) + + def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): + # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) + bsz, num_heads, q_seq_len, dk = query.size() + _, _, k_seq_len, _ = key.size() + + # Preallocate attn_weights for `baddbmm` + attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) + + # Compute Scale Factor + scale_factor = 1.0 + if self.scale_attn_weights: + scale_factor /= float(value.size(-1)) ** 0.5 + + if self.scale_attn_by_inverse_layer_idx: + scale_factor /= float(self.layer_idx + 1) + + # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk)) + with torch.autocast(query.device.type, enabled=False): + q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) + attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) + attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise + if attn_weights.dtype != torch.float32: + raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2) + + return attn_output, attn_weights + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: Optional[tuple[torch.FloatTensor]], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]: + is_cross_attention = encoder_hidden_states is not None + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_layer from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + if is_cross_attention: + if not hasattr(self, "q_attn"): + raise ValueError( + "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `DecisionTransformerGPT2Attention(..., is_cross_attention=True)`." + ) + query_states = self.q_attn(hidden_states) + attention_mask = encoder_attention_mask + + # Try to get key/value states from cache if possible + if past_key_values is not None and is_updated: + key_states = curr_past_key_value.layers[self.layer_idx].keys + value_states = curr_past_key_value.layers[self.layer_idx].values + else: + key_states, value_states = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) + shape_kv = (*key_states.shape[:-1], -1, self.head_dim) + key_states = key_states.view(shape_kv).transpose(1, 2) + value_states = value_states.view(shape_kv).transpose(1, 2) + else: + query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2) + shape_kv = (*key_states.shape[:-1], -1, self.head_dim) + key_states = key_states.view(shape_kv).transpose(1, 2) + value_states = value_states.view(shape_kv).transpose(1, 2) + + shape_q = (*query_states.shape[:-1], -1, self.head_dim) + query_states = query_states.view(shape_q).transpose(1, 2) + + if (past_key_values is not None and not is_cross_attention) or ( + past_key_values is not None and is_cross_attention and not is_updated + ): + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + past_key_values.is_updated[self.layer_idx] = True + + is_causal = attention_mask is None and query_states.shape[-2] > 1 and not is_cross_attention + + using_eager = self.config._attn_implementation == "eager" + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + if using_eager and self.reorder_and_upcast_attn: + attn_output, attn_weights = self._upcast_and_reordered_attn( + query_states, key_states, value_states, attention_mask, head_mask + ) + else: + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + head_mask=head_mask, + dropout=self.attn_dropout.p if self.training else 0.0, + is_causal=is_causal, + **kwargs, + ) + + attn_output = attn_output.reshape(*attn_output.shape[:-2], -1).contiguous() + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + return attn_output, attn_weights + + +# Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP with GPT2->DecisionTransformerGPT2 +class DecisionTransformerGPT2MLP(nn.Module): + def __init__(self, intermediate_size, config): + super().__init__() + embed_dim = config.hidden_size + self.c_fc = Conv1D(intermediate_size, embed_dim) + self.c_proj = Conv1D(embed_dim, intermediate_size) + self.act = ACT2FN[config.activation_function] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.FloatTensor: + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.gpt2.modeling_gpt2.GPT2Block with GPT2->DecisionTransformerGPT2 +class DecisionTransformerGPT2Block(GradientCheckpointingLayer): + # Ignore copy + def __init__(self, config, layer_idx=None): + super().__init__() + hidden_size = config.hidden_size + inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size + + self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.attn = DecisionTransformerGPT2Attention(config, layer_idx=layer_idx) + self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + if config.add_cross_attention: + self.crossattention = DecisionTransformerGPT2Attention( + config, is_cross_attention=True, layer_idx=layer_idx + ) + self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + self.mlp = DecisionTransformerGPT2MLP(inner_dim, config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: Optional[tuple[torch.FloatTensor]], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_output, self_attn_weights = self.attn( + hidden_states, + past_key_values=past_key_values, + cache_position=cache_position, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + **kwargs, + ) + # residual connection + hidden_states = attn_output + residual + + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + residual = hidden_states + hidden_states = self.ln_cross_attn(hidden_states) + cross_attn_output, cross_attn_weights = self.crossattention( + hidden_states, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + # residual connection + hidden_states = residual + cross_attn_output + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + feed_forward_hidden_states + + outputs = (hidden_states,) + if output_attentions: + outputs += (self_attn_weights,) + if encoder_hidden_states is not None: + outputs += (cross_attn_weights,) + + return outputs + + +@auto_docstring +class DecisionTransformerGPT2PreTrainedModel(PreTrainedModel): + config: DecisionTransformerConfig + load_tf_weights = load_tf_weights_in_gpt2 + base_model_prefix = "transformer" + is_parallelizable = True + supports_gradient_checkpointing = True + + _can_compile_fullgraph = False + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if "c_proj" in name and "weight" in name: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))) + + +class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embed_dim = config.hidden_size + + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) + + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList( + [DecisionTransformerGPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)] + ) + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + + # based on pattern from src/transformers/models/whisper/modeling_whisper.py::WhisperDecoder and similar addition in GPT2Model + if use_cache: + if past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if self.config.add_cross_attention and not isinstance(past_key_values, EncoderDecoderCache): + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache(config=self.config)) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.add_cross_attention and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + + output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_hidden_states = () if output_hidden_states else None + for i, block in enumerate(self.h): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if isinstance(head_mask, torch.Tensor): + head_mask = head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block( + hidden_states, + past_key_values if not (self.gradient_checkpointing and self.training) else None, + cache_position, + attention_mask, + head_mask[i], + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (outputs[2],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_values = past_key_values if use_cache else None + # no return to legacy cache + if not return_dict: + return tuple( + v + for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for model's outputs that also contains a pooling of the last hidden states. + """ +) +class DecisionTransformerOutput(ModelOutput): + r""" + state_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, state_dim)`): + Environment state predictions + action_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, action_dim)`): + Model action predictions + return_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, 1)`): + Predicted returns for each state + """ + + state_preds: Optional[torch.FloatTensor] = None + action_preds: Optional[torch.FloatTensor] = None + return_preds: Optional[torch.FloatTensor] = None + hidden_states: Optional[torch.FloatTensor] = None + attentions: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + + +class DecisionTransformerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: DecisionTransformerConfig + base_model_prefix = "decision_transformer" + main_input_name = "states" + supports_gradient_checkpointing = False + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring( + custom_intro=""" + The Decision Transformer Model + """ +) +class DecisionTransformerModel(DecisionTransformerPreTrainedModel): + """ + + The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL + setting. Refer to the paper for more details: https://huggingface.co/papers/2106.01345 + + """ + + def __init__(self, config): + super().__init__(config) + self.config = config + self.hidden_size = config.hidden_size + # note: the only difference between this GPT2Model and the default Huggingface version + # is that the positional embeddings are removed (since we'll add those ourselves) + self.encoder = DecisionTransformerGPT2Model(config) + + self.embed_timestep = nn.Embedding(config.max_ep_len, config.hidden_size) + self.embed_return = torch.nn.Linear(1, config.hidden_size) + self.embed_state = torch.nn.Linear(config.state_dim, config.hidden_size) + self.embed_action = torch.nn.Linear(config.act_dim, config.hidden_size) + + self.embed_ln = nn.LayerNorm(config.hidden_size) + + # note: we don't predict states or returns for the paper + self.predict_state = torch.nn.Linear(config.hidden_size, config.state_dim) + self.predict_action = nn.Sequential( + *([nn.Linear(config.hidden_size, config.act_dim)] + ([nn.Tanh()] if config.action_tanh else [])) + ) + self.predict_return = torch.nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + states: Optional[torch.FloatTensor] = None, + actions: Optional[torch.FloatTensor] = None, + rewards: Optional[torch.FloatTensor] = None, + returns_to_go: Optional[torch.FloatTensor] = None, + timesteps: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DecisionTransformerOutput]: + r""" + states (`torch.FloatTensor` of shape `(batch_size, episode_length, state_dim)`): + The states for each step in the trajectory + actions (`torch.FloatTensor` of shape `(batch_size, episode_length, act_dim)`): + The actions taken by the "expert" policy for the current state, these are masked for auto regressive + prediction + rewards (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`): + The rewards for each state, action + returns_to_go (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`): + The returns for each state in the trajectory + timesteps (`torch.LongTensor` of shape `(batch_size, episode_length)`): + The timestep for each step in the trajectory + + Examples: + + ```python + >>> from transformers import DecisionTransformerModel + >>> import torch + + >>> model = DecisionTransformerModel.from_pretrained("edbeeching/decision-transformer-gym-hopper-medium") + >>> # evaluation + >>> model = model.to(device) + >>> model.eval() + + >>> env = gym.make("Hopper-v3") + >>> state_dim = env.observation_space.shape[0] + >>> act_dim = env.action_space.shape[0] + + >>> state = env.reset() + >>> states = torch.from_numpy(state).reshape(1, 1, state_dim).to(device=device, dtype=torch.float32) + >>> actions = torch.zeros((1, 1, act_dim), device=device, dtype=torch.float32) + >>> rewards = torch.zeros(1, 1, device=device, dtype=torch.float32) + >>> target_return = torch.tensor(TARGET_RETURN, dtype=torch.float32).reshape(1, 1) + >>> timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1) + >>> attention_mask = torch.zeros(1, 1, device=device, dtype=torch.float32) + + >>> # forward pass + >>> with torch.no_grad(): + ... state_preds, action_preds, return_preds = model( + ... states=states, + ... actions=actions, + ... rewards=rewards, + ... returns_to_go=target_return, + ... timesteps=timesteps, + ... attention_mask=attention_mask, + ... return_dict=False, + ... ) + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = states.shape[0], states.shape[1] + + if attention_mask is None: + # attention mask for GPT: 1 if can be attended to, 0 if not + attention_mask = torch.ones((batch_size, seq_length), dtype=torch.long) + + # embed each modality with a different head + state_embeddings = self.embed_state(states) + action_embeddings = self.embed_action(actions) + returns_embeddings = self.embed_return(returns_to_go) + time_embeddings = self.embed_timestep(timesteps) + + # time embeddings are treated similar to positional embeddings + state_embeddings = state_embeddings + time_embeddings + action_embeddings = action_embeddings + time_embeddings + returns_embeddings = returns_embeddings + time_embeddings + + # this makes the sequence look like (R_1, s_1, a_1, R_2, s_2, a_2, ...) + # which works nice in an autoregressive sense since states predict actions + stacked_inputs = ( + torch.stack((returns_embeddings, state_embeddings, action_embeddings), dim=1) + .permute(0, 2, 1, 3) + .reshape(batch_size, 3 * seq_length, self.hidden_size) + ) + stacked_inputs = self.embed_ln(stacked_inputs) + + # to make the attention mask fit the stacked inputs, have to stack it as well + stacked_attention_mask = ( + torch.stack((attention_mask, attention_mask, attention_mask), dim=1) + .permute(0, 2, 1) + .reshape(batch_size, 3 * seq_length) + ) + device = stacked_inputs.device + # we feed in the input embeddings (not word indices as in NLP) to the model + encoder_outputs = self.encoder( + inputs_embeds=stacked_inputs, + attention_mask=stacked_attention_mask, + position_ids=torch.zeros(stacked_attention_mask.shape, device=device, dtype=torch.long), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + x = encoder_outputs[0] + + # reshape x so that the second dimension corresponds to the original + # returns (0), states (1), or actions (2); i.e. x[:,1,t] is the token for s_t + x = x.reshape(batch_size, seq_length, 3, self.hidden_size).permute(0, 2, 1, 3) + + # get predictions + return_preds = self.predict_return(x[:, 2]) # predict next return given state and action + state_preds = self.predict_state(x[:, 2]) # predict next state given state and action + action_preds = self.predict_action(x[:, 1]) # predict next action given state + if not return_dict: + return (state_preds, action_preds, return_preds) + + return DecisionTransformerOutput( + last_hidden_state=encoder_outputs.last_hidden_state, + state_preds=state_preds, + action_preds=action_preds, + return_preds=return_preds, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +__all__ = [ + "DecisionTransformerGPT2Model", + "DecisionTransformerGPT2PreTrainedModel", + "DecisionTransformerModel", + "DecisionTransformerPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/configuration_deepseek_vl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/configuration_deepseek_vl.py new file mode 100644 index 0000000000000000000000000000000000000000..b3abae5af0a7f7613432e71ef13a1c3b9774176c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/configuration_deepseek_vl.py @@ -0,0 +1,97 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class DeepseekVLConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DeepseekVLModel`]. It is used to instantiate a + DeepseekVL model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DeepseekVL + [deepseek-community/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-community/deepseek-vl-1.3b-chat) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + image_token_id (`int`, *optional*, defaults to 100015): + The index representing image tokens in the model's token vocabulary. + + Example: + + ```python + >>> from transformers import DeepseekVLConfig, DeepseekVLModel + + >>> # Initializing a DeepseekVL deepseek-community/deepseek-vl-1.3b-chat style configuration + >>> configuration = DeepseekVLConfig() + + >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-1.3b-chat style configuration + >>> model = DeepseekVLModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deepseek_vl" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + text_config: Optional[AutoConfig] = None, + vision_config: Optional[AutoConfig] = None, + image_token_id: int = 100015, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.") + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "llama") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + + self.text_config = text_config + self.vision_config = vision_config + self.image_token_id = image_token_id + + +__all__ = ["DeepseekVLConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/image_processing_deepseek_vl.py new file mode 100644 index 0000000000000000000000000000000000000000..12aa7caf892e52262c5d612551a19e8f9bd4f58d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/image_processing_deepseek_vl.py @@ -0,0 +1,425 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class DeepseekVLImageProcessor(BaseImageProcessor): + r""" + Constructs a DEEPSEEK_VL image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image to square or not. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + min_size: int = 14, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: Optional[bool] = None, + do_pad: Optional[bool] = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + + self.do_pad = do_pad + self.min_size = min_size + if image_mean is None: + self.background_color = (127, 127, 127) + else: + self.background_color = tuple(int(x * 255) for x in image_mean) + + def resize( + self, + image: np.ndarray, + size: Union[dict[str, int], int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to dynamically calculated size. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]` or `int`): + The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `None`: will be inferred from input + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + height, width = get_image_size(image, input_data_format) + max_size = max(height, width) + + size = get_size_dict(size, default_to_square=True) + if size["height"] != size["width"]: + raise ValueError( + f"Output height and width must be the same. Got height={size['height']} and width={size['width']}" + ) + size = size["height"] + + delta = size / max_size + # Largest side becomes `size` and the other side is scaled according to the aspect ratio. + output_size_nonpadded = [ + max(int(height * delta), self.min_size), + max(int(width * delta), self.min_size), + ] + + image = resize( + image, + size=output_size_nonpadded, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + return image + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: Optional[bool] = None, + background_color: Optional[Union[int, tuple[int, int, int]]] = None, + do_pad: Optional[bool] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + background_color (`tuple[int, int, int]`): + The background color to use for the padding. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image to square or not. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_pad = do_pad if do_pad is not None else self.do_pad + background_color = background_color if background_color is not None else self.background_color + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_pad: + # Expand and pad the images to obtain a square image of dimensions `size x size` + images = [ + self.pad_to_square( + image=image, + background_color=background_color, + input_data_format=input_data_format, + ) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + + return encoded_outputs + + def pad_to_square( + self, + image: np.ndarray, + background_color: Union[int, tuple[int, int, int]] = 0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pads an image to a square based on the longest edge. + + Args: + image (`np.ndarray`): + The image to pad. + background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0): + The color to use for the padding. Can be an integer for single channel or a + tuple of integers representing for multi-channel images. If passed as integer + in multi-channel mode, it will default to `0` in subsequent channels. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + + Returns: + `np.ndarray`: The padded image. + """ + height, width = get_image_size(image, input_data_format) + num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1] + + if height == width: + image = ( + to_channel_dimension_format(image, data_format, input_data_format) + if data_format is not None + else image + ) + return image + + max_dim = max(height, width) + + # Ensure background_color is the correct shape + if isinstance(background_color, int): + background_color = [background_color] + elif len(background_color) != num_channels: + raise ValueError( + f"background_color must have no more than {num_channels} elements to match the number of channels" + ) + + if input_data_format == ChannelDimension.FIRST: + result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype) + for i, color in enumerate(background_color): + result[i, :, :] = color + if width > height: + start = (max_dim - height) // 2 + result[:, start : start + height, :] = image + else: + start = (max_dim - width) // 2 + result[:, :, start : start + width] = image + else: + result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype) + for i, color in enumerate(background_color): + result[:, :, i] = color + if width > height: + start = (max_dim - height) // 2 + result[start : start + height, :, :] = image + else: + start = (max_dim - width) // 2 + result[:, start : start + width, :] = image + + return result + + +__all__ = ["DeepseekVLImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/modeling_deepseek_vl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/modeling_deepseek_vl.py new file mode 100644 index 0000000000000000000000000000000000000000..ce884da8d08bb8680152bd2a58e537f876919f0f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/modeling_deepseek_vl.py @@ -0,0 +1,350 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn as nn + +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_outputs import ModelOutput +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ..auto import AutoModel +from .configuration_deepseek_vl import DeepseekVLConfig + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for DeepseekVL model's outputs that may also contain a past key/values (to speed up sequential decoding). + """ +) +class DeepseekVLBaseModelOutputWithPast(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): + Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, + sequence_length, hidden_size)`. + + image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for DeepseekVL causal language model (or autoregressive) outputs. + """ +) +class DeepseekVLCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): + Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, + sequence_length, hidden_size)`. + + image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[tuple[torch.FloatTensor]] = None + + +class DeepseekVLAligner(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + in_features = config.vision_config.hidden_size + out_features = config.text_config.hidden_size + + self.linear1 = nn.Linear(in_features, out_features) + self.activation = nn.GELU() + self.linear2 = nn.Linear(out_features, out_features) + + def forward(self, vision_encodings: torch.Tensor) -> torch.Tensor: + x = self.linear1(vision_encodings) + x = self.activation(x) + x = self.linear2(x) + return x + + +@auto_docstring +class DeepseekVLPreTrainedModel(PreTrainedModel): + config: DeepseekVLConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["LlamaDecoderLayer"] + _skip_keys_device_placement = ["past_key_values", "causal_mask"] + _supports_flash_attn = True + _supports_sdpa = True + + _can_compile_fullgraph = True + _supports_param_buffer_assignment = False + + def _init_weights(self, module): + """Initialize the weights""" + # Required only for Linear layer in DeepseekVLAligner + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + +@auto_docstring +class DeepseekVLModel(DeepseekVLPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.vision_model = AutoModel.from_config(config.vision_config) + self.aligner = DeepseekVLAligner(config) + + self.language_model = AutoModel.from_config(config=config.text_config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing. + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_image_features(self, pixel_values): + image_embeds = self.vision_model(pixel_values) + image_embeds = self.aligner(image_embeds.last_hidden_state) + return image_embeds + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + n_image_features = image_features.shape[0] * image_features.shape[1] + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ): + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_embeds = self.get_image_features(pixel_values) + image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1]) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + image_attention_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features) + + lm_output = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + return DeepseekVLBaseModelOutputWithPast( + last_hidden_state=lm_output.last_hidden_state, + past_key_values=lm_output.past_key_values, + hidden_states=lm_output.hidden_states, + attentions=lm_output.attentions, + image_hidden_states=image_embeds if pixel_values is not None else None, + ) + + +class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["model.language_model.embed_tokens.weight", "lm_head.weight"] + _can_compile_fullgraph = True + + def __init__(self, config: DeepseekVLConfig): + super().__init__(config) + self.config = config + self.model = DeepseekVLModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + + # Initialize weights and apply final processing. + self.post_init() + + def get_input_embeddings(self): + return self.model.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.language_model.set_input_embeddings(value) + + def prepare_embeddings_for_image_generation(self) -> torch.Tensor: + raise AttributeError("Not needed for DeepseekVL") + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + """ + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return DeepseekVLCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + pixel_values=None, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- extra custom processing + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + if cache_position[0] == 0: + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + +__all__ = ["DeepseekVLPreTrainedModel", "DeepseekVLModel", "DeepseekVLForConditionalGeneration"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/modular_deepseek_vl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/modular_deepseek_vl.py new file mode 100644 index 0000000000000000000000000000000000000000..5bfc0ae7d74c3b48f9a009edcc7daba60ccc3382 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/modular_deepseek_vl.py @@ -0,0 +1,333 @@ +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import torch +import torch.nn as nn + +from ...configuration_utils import PretrainedConfig +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import ( + PreTokenizedInput, + TextInput, +) +from ...utils import ( + auto_docstring, + logging, +) +from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel +from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast +from ..janus.image_processing_janus import JanusImageProcessor +from ..janus.image_processing_janus_fast import JanusImageProcessorFast +from ..janus.modeling_janus import JanusForConditionalGeneration, JanusModel, JanusPreTrainedModel + + +logger = logging.get_logger(__name__) + + +class DeepseekVLConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DeepseekVLModel`]. It is used to instantiate a + DeepseekVL model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DeepseekVL + [deepseek-community/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-community/deepseek-vl-1.3b-chat) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + image_token_id (`int`, *optional*, defaults to 100015): + The index representing image tokens in the model's token vocabulary. + + Example: + + ```python + >>> from transformers import DeepseekVLConfig, DeepseekVLModel + + >>> # Initializing a DeepseekVL deepseek-community/deepseek-vl-1.3b-chat style configuration + >>> configuration = DeepseekVLConfig() + + >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-1.3b-chat style configuration + >>> model = DeepseekVLModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deepseek_vl" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + text_config: Optional[AutoConfig] = None, + vision_config: Optional[AutoConfig] = None, + image_token_id: int = 100015, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.") + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "llama") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + + self.text_config = text_config + self.vision_config = vision_config + self.image_token_id = image_token_id + + +class DeepseekVLBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast): + pass + + +class DeepseekVLCausalLMOutputWithPast(IdeficsCausalLMOutputWithPast): + pass + + +class DeepseekVLAligner(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + in_features = config.vision_config.hidden_size + out_features = config.text_config.hidden_size + + self.linear1 = nn.Linear(in_features, out_features) + self.activation = nn.GELU() + self.linear2 = nn.Linear(out_features, out_features) + + def forward(self, vision_encodings: torch.Tensor) -> torch.Tensor: + x = self.linear1(vision_encodings) + x = self.activation(x) + x = self.linear2(x) + return x + + +class DeepseekVLPreTrainedModel(JanusPreTrainedModel): + _no_split_modules = ["LlamaDecoderLayer"] + + def _init_weights(self, module): + """Initialize the weights""" + # Required only for Linear layer in DeepseekVLAligner + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + +@auto_docstring +class DeepseekVLModel(JanusModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.vision_model = AutoModel.from_config(config.vision_config) + self.aligner = DeepseekVLAligner(config) + + self.language_model = AutoModel.from_config(config=config.text_config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing. + self.post_init() + + del self.vqmodel + del self.generation_embeddings + del self.generation_aligner + del self.generation_head + + +class DeepseekVLForConditionalGeneration(JanusForConditionalGeneration): + def prepare_embeddings_for_image_generation(self): + raise AttributeError("Not needed for DeepseekVL") + + def decode_image_tokens(self): + raise AttributeError("Not needed for DeepseekVL") + + def generate(self): + raise AttributeError("Not needed for DeepseekVL") + + +class DeepseekVLImageProcessor(JanusImageProcessor): + def __init__(self, **super_kwargs): + super().__init__(**super_kwargs) + + def postprocess(self): + raise AttributeError("Not needed for DeepseekVL") + + def unnormalize(self): + raise AttributeError("Not needed for DeepseekVL") + + +class DeepseekVLImageProcessorFast(JanusImageProcessorFast): + def __init__(self, **super_kwargs): + super().__init__(**super_kwargs) + + def postprocess(self): + raise AttributeError("Not needed for DeepseekVL") + + +class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": {"padding": False}, + "common_kwargs": {"return_tensors": "pt"}, + } + + +class DeepseekVLProcessor(ProcessorMixin): + r""" + Constructs a DeepseekVL processor which wraps a DeepseekVL Image Processor and a Llama tokenizer into a single processor. + + [`DeepseekVLProcessor`] offers all the functionalities of [`DeepseekVLImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`~DeepseekVLProcessor.__call__`] and [`~DeepseekVLProcessor.decode`] for more information. + + Args: + image_processor ([`DeepseekVLImageProcessor`]): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`]): + The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + num_image_tokens (`int`, *optional*, defaults to 576): + The number of special image tokens used as placeholders for visual content in text sequences. + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "num_image_tokens"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor, + tokenizer, + chat_template=None, + num_image_tokens=576, + ): + self.image_token = tokenizer.image_token + self.num_image_tokens = num_image_tokens + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + images: Optional[ImageInput] = None, + **kwargs: Unpack[DeepseekVLProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + DeepseekVLProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs + ) + if text is None and images is None: + raise ValueError("You must specify either text or images.") + + if text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + prompt_strings = [] + one_img_tokens = self.image_token * self.num_image_tokens + for prompt in text: + prompt = prompt.replace(self.image_token, one_img_tokens) + prompt_strings.append(prompt) + + data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + + # process images if pixel_values are provided + if images is not None: + data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] + + return BatchFeature(data=data) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = [ + "DeepseekVLConfig", + "DeepseekVLPreTrainedModel", + "DeepseekVLModel", + "DeepseekVLForConditionalGeneration", + "DeepseekVLImageProcessor", + "DeepseekVLImageProcessorFast", + "DeepseekVLProcessor", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94189b16d9784bf35d24d52c602514f6b387afc3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/configuration_deformable_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/configuration_deformable_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a4707d31dc609dd91977f3a5ede00a0f0c92d10 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/configuration_deformable_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/feature_extraction_deformable_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/feature_extraction_deformable_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d26e0670a59457103cd7c92101a95b182e712b61 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/feature_extraction_deformable_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55ac45c6bdc722224bb6f7b612411cd2fc1e28cb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b76c6a46be7da65cb30209dd386edc0e03fc5ccc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/modeling_deformable_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/modeling_deformable_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9ec83ab52900788222d9b6913b6b2c3930c7640 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/modeling_deformable_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/modular_deformable_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/modular_deformable_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17fad64a6922343e6f6472dd637b8d9559a1e9f0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/modular_deformable_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8ef20e7e9a7174f90a132cda36ed9591f211c10 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/bort/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/bort/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6e06e186741991b2f9183f30b6591477bc60dfb5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_deta import * + from .image_processing_deta import * + from .modeling_deta import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/configuration_deta.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/configuration_deta.py new file mode 100644 index 0000000000000000000000000000000000000000..2109902ac06546ce4e235403dd3b95bf27486272 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/configuration_deta.py @@ -0,0 +1,278 @@ +# coding=utf-8 +# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DETA model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging +from ...auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class DetaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DetaModel`]. It is used to instantiate a DETA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the DETA + [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`): + The configuration of the backbone model. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, `False`): + Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + num_queries (`int`, *optional*, defaults to 900): + Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can + detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + num_feature_levels (`int`, *optional*, defaults to 5): + The number of input feature levels. + encoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the encoder. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + two_stage (`bool`, *optional*, defaults to `True`): + Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of + DETA, which are further fed into the decoder for iterative bounding box refinement. + two_stage_num_proposals (`int`, *optional*, defaults to 300): + The number of region proposals to be generated, in case `two_stage` is set to `True`. + with_box_refine (`bool`, *optional*, defaults to `True`): + Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes + based on the predictions from the previous layer. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss. + assign_first_stage (`bool`, *optional*, defaults to `True`): + Whether to assign each prediction i to the highest overlapping ground truth object if the overlap is larger than a threshold 0.7. + assign_second_stage (`bool`, *optional*, defaults to `True`): + Whether to assign second assignment procedure in the second stage closely follows the first stage assignment procedure. + disable_custom_kernels (`bool`, *optional*, defaults to `True`): + Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom + kernels are not supported by PyTorch ONNX export. + + Examples: + + ```python + >>> from transformers import DetaConfig, DetaModel + + >>> # Initializing a DETA SenseTime/deformable-detr style configuration + >>> configuration = DetaConfig() + + >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration + >>> model = DetaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deta" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + use_timm_backbone=False, + backbone_kwargs=None, + num_queries=900, + max_position_embeddings=2048, + encoder_layers=6, + encoder_ffn_dim=2048, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=1024, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + return_intermediate=True, + auxiliary_loss=False, + position_embedding_type="sine", + num_feature_levels=5, + encoder_n_points=4, + decoder_n_points=4, + two_stage=True, + two_stage_num_proposals=300, + with_box_refine=True, + assign_first_stage=True, + assign_second_stage=True, + class_cost=1, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.1, + focal_alpha=0.25, + disable_custom_kernels=True, + **kwargs, + ): + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: + logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") + backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage2", "stage3", "stage4"]) + else: + if isinstance(backbone_config, dict): + backbone_model_type = backbone_config.pop("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: + raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") + + self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + # deformable attributes + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.two_stage = two_stage + self.two_stage_num_proposals = two_stage_num_proposals + self.with_box_refine = with_box_refine + self.assign_first_stage = assign_first_stage + self.assign_second_stage = assign_second_stage + if two_stage is True and with_box_refine is False: + raise ValueError("If two_stage is True, with_box_refine must be True.") + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.eos_coefficient = eos_coefficient + self.focal_alpha = focal_alpha + self.disable_custom_kernels = disable_custom_kernels + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + @property + def sub_configs(self): + return ( + {"backbone_config": type(self.backbone_config)} + if getattr(self, "backbone_config", None) is not None + else {} + ) + + +__all__ = ["DetaConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/image_processing_deta.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/image_processing_deta.py new file mode 100644 index 0000000000000000000000000000000000000000..434d25a1ab5152b0de5e74463c3fe15eb16f901b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/image_processing_deta.py @@ -0,0 +1,1228 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Deformable DETR.""" + +import pathlib +from collections.abc import Iterable +from typing import Any, Callable, Optional, Union + +import numpy as np + +from ....feature_extraction_utils import BatchFeature +from ....image_processing_utils import BaseImageProcessor, get_size_dict +from ....image_transforms import ( + PaddingMode, + center_to_corners_format, + corners_to_center_format, + pad, + rescale, + resize, + rgb_to_id, + to_channel_dimension_format, +) +from ....image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + AnnotationFormat, + AnnotationType, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_batched, + is_scaled_image, + to_numpy_array, + valid_images, + validate_annotations, + validate_preprocess_arguments, +) +from ....utils import ( + is_flax_available, + is_jax_tensor, + is_tf_available, + is_tf_tensor, + is_torch_available, + is_torch_tensor, + is_torchvision_available, + is_vision_available, + logging, +) +from ....utils.generic import TensorType + + +if is_torch_available(): + import torch + + +if is_torchvision_available(): + from torchvision.ops.boxes import batched_nms + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) + + +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + """ + height, width = image_size + raw_size = None + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) + + if (height <= width and height == size) or (width <= height and width == size): + oh, ow = height, width + elif width < height: + ow = size + if max_size is not None and raw_size is not None: + oh = int(raw_size * height / width) + else: + oh = int(size * height / width) + else: + oh = size + if max_size is not None and raw_size is not None: + ow = int(raw_size * width / height) + else: + ow = int(size * width / height) + + return (oh, ow) + + +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, tuple[int, int], list[int]], + max_size: Optional[int] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. If the desired output size + is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output + image size is computed by keeping the aspect ratio of the input image size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `tuple[int, int]` or `list[int]`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + if isinstance(size, (list, tuple)): + return size + + return get_size_with_aspect_ratio(image_size, size, max_size) + + +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + +def get_numpy_to_framework_fn(arr) -> Callable: + """ + Returns a function that converts a numpy array to the framework of the input array. + + Args: + arr (`np.ndarray`): The array to convert. + """ + if isinstance(arr, np.ndarray): + return np.array + if is_tf_available() and is_tf_tensor(arr): + import tensorflow as tf + + return tf.convert_to_tensor + if is_torch_available() and is_torch_tensor(arr): + import torch + + return torch.tensor + if is_flax_available() and is_jax_tensor(arr): + import jax.numpy as jnp + + return jnp.array + raise ValueError(f"Cannot convert arrays of type {type(arr)}") + + +def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: + """ + Squeezes an array, but only if the axis specified has dim 1. + """ + if axis is None: + return arr.squeeze() + + try: + return arr.squeeze(axis=axis) + except ValueError: + return arr + + +def normalize_annotation(annotation: dict, image_size: tuple[int, int]) -> dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + +def max_across_indices(values: Iterable[Any]) -> list[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +def get_max_height_width( + images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> list[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +def make_pixel_mask( + image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`tuple[int, int]`): + Output size of the mask. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + + +def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`list[list[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. + """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by DETA. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # Get all COCO annotations for the given image. + annotations = target["annotations"] + annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + + classes = [obj["category_id"] for obj in annotations] + classes = np.asarray(classes, dtype=np.int64) + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) + iscrowd = np.asarray([obj.get("iscrowd", 0) for obj in annotations], dtype=np.int64) + + boxes = [obj["bbox"] for obj in annotations] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = {} + new_target["image_id"] = image_id + new_target["class_labels"] = classes[keep] + new_target["boxes"] = boxes[keep] + new_target["area"] = area[keep] + new_target["iscrowd"] = iscrowd[keep] + new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) + + if annotations and "keypoints" in annotations[0]: + keypoints = [obj["keypoints"] for obj in annotations] + # Converting the filtered keypoints list to a numpy array + keypoints = np.asarray(keypoints, dtype=np.float32) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints + + if return_segmentation_masks: + segmentation_masks = [obj["segmentation"] for obj in annotations] + masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width) + new_target["masks"] = masks[keep] + + return new_target + + +def masks_to_boxes(masks: np.ndarray) -> np.ndarray: + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + Args: + masks: masks in format `[number_masks, height, width]` where N is the number of masks + + Returns: + boxes: bounding boxes in format `[number_masks, 4]` in xyxy format + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +def prepare_coco_panoptic_annotation( + image: np.ndarray, + target: dict, + masks_path: Union[str, pathlib.Path], + return_masks: bool = True, + input_data_format: Union[ChannelDimension, str] = None, +) -> dict: + """ + Prepare a coco panoptic annotation for DETA. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + annotation_path = pathlib.Path(masks_path) / target["file_name"] + + new_target = {} + new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64) + new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64) + new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64) + + if "segments_info" in target: + masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([segment_info["id"] for segment_info in target["segments_info"]]) + masks = masks == ids[:, None, None] + masks = masks.astype(np.uint8) + if return_masks: + new_target["masks"] = masks + new_target["boxes"] = masks_to_boxes(masks) + new_target["class_labels"] = np.array( + [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["iscrowd"] = np.asarray( + [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["area"] = np.asarray( + [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32 + ) + + return new_target + + +def resize_annotation( + annotation: dict[str, Any], + orig_size: tuple[int, int], + target_size: tuple[int, int], + threshold: float = 0.5, + resample: PILImageResampling = PILImageResampling.NEAREST, +): + """ + Resizes an annotation to a target size. + + Args: + annotation (`dict[str, Any]`): + The annotation dictionary. + orig_size (`tuple[int, int]`): + The original size of the input image. + target_size (`tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): + The resampling filter to use when resizing the masks. + """ + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) + ratio_height, ratio_width = ratios + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) + masks = masks.astype(np.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + +class DetaImageProcessor(BaseImageProcessor): + r""" + Constructs a Deformable DETR image processor. + + Args: + format (`str`, *optional*, defaults to `"coco_detection"`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize: + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_annotations: bool = True, + do_pad: bool = True, + pad_size: Optional[dict[str, int]] = None, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + size = get_size_dict(size, default_to_square=False) + + if do_convert_annotations is None: + do_convert_annotations = do_normalize + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.do_convert_annotations = do_convert_annotations + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + self.pad_size = pad_size + + def prepare_annotation( + self, + image: np.ndarray, + target: dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> dict: + """ + Prepare an annotation for feeding into DETA model. + """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + elif format == AnnotationFormat.COCO_PANOPTIC: + return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_panoptic_annotation( + image, + target, + masks_path=masks_path, + return_masks=return_segmentation_masks, + input_data_format=input_data_format, + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + data_format (`ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input + image. + """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" in size and "longest_edge" in size: + new_size = get_resize_output_image_size( + image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format + ) + elif "height" in size and "width" in size: + new_size = (size["height"], size["width"]) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + image = resize( + image, size=new_size, resample=resample, data_format=data_format, input_data_format=input_data_format + ) + return image + + def resize_annotation( + self, + annotation, + orig_size, + size, + resample: PILImageResampling = PILImageResampling.NEAREST, + ) -> dict: + """ + Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched + to this number. + """ + return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) + + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ + return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + + def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict: + """ + Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to + `[center_x, center_y, width, height]` format and from absolute to relative pixel values. + """ + return normalize_annotation(annotation, image_size=image_size) + + def _update_annotation_for_padded_image( + self, + annotation: dict, + input_image_size: tuple[int, int], + output_image_size: tuple[int, int], + padding, + update_bboxes, + ) -> dict: + """ + Update the annotation for a padded image. + """ + new_annotation = {} + new_annotation["size"] = output_image_size + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = pad( + masks, + padding, + mode=PaddingMode.CONSTANT, + constant_values=0, + input_data_format=ChannelDimension.FIRST, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= np.asarray( + [ + input_image_size[1] / output_image_size[1], + input_image_size[0] / output_image_size[0], + input_image_size[1] / output_image_size[1], + input_image_size[0] / output_image_size[0], + ] + ) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + def _pad_image( + self, + image: np.ndarray, + output_size: tuple[int, int], + annotation: Optional[dict[str, Any]] = None, + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + update_bboxes: bool = True, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes + ) + return padded_image, annotation + + def pad( + self, + images: list[np.ndarray], + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + update_bboxes: bool = True, + pad_size: Optional[dict[str, int]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + images (list[`np.ndarray`]): + Images to pad. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + update_bboxes (`bool`, *optional*, defaults to `True`): + Whether to update the bounding boxes in the annotations to match the padded images. If the + bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` + format, the bounding boxes will not be updated. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) + + annotation_list = annotations if annotations is not None else [None] * len(images) + padded_images = [] + padded_annotations = [] + for image, annotation in zip(images, annotation_list): + padded_image, padded_annotation = self._pad_image( + image, + padded_size, + annotation, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + update_bboxes=update_bboxes, + ) + padded_images.append(padded_image) + padded_annotations.append(padded_annotation) + + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations + ] + + return encoded_inputs + + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[list[dict], list[list[dict]]]] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample=None, # PILImageResampling + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_annotations: Optional[bool] = None, + do_pad: Optional[bool] = None, + format: Optional[Union[str, AnnotationFormat]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[dict[str, int]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging + from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. + annotations (`list[Dict]` or `list[list[Dict]]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to self.size): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. + do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `list[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations): + Whether to convert the annotations to the format expected by the model. Converts the bounding + boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` + and in relative coordinates. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. + format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): + Format of the annotations. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead.", + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_convert_annotations = ( + self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations + ) + do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size + format = self.format if format is None else format + + # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated. + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if not is_batched(images): + images = [images] + annotations = [annotations] if annotations is not None else None + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + if ( + masks_path is not None + and format == AnnotationFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + # transformations + if do_resize: + if annotations is not None: + resized_images, resized_annotations = [], [] + for image, target in zip(images, annotations): + orig_size = get_image_size(image, input_data_format) + resized_image = self.resize( + image, size=size, resample=resample, input_data_format=input_data_format + ) + resized_annotation = self.resize_annotation( + target, orig_size, get_image_size(resized_image, input_data_format) + ) + resized_images.append(resized_image) + resized_annotations.append(resized_annotation) + images = resized_images + annotations = resized_annotations + del resized_images, resized_annotations + else: + images = [ + self.resize(image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + + if do_convert_annotations and annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + encoded_inputs = self.pad( + images, + annotations=annotations, + return_pixel_mask=True, + data_format=data_format, + input_data_format=input_data_format, + return_tensors=return_tensors, + update_bboxes=do_convert_annotations, + pad_size=pad_size, + ) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + + return encoded_inputs + + def post_process_object_detection( + self, + outputs, + threshold: float = 0.5, + target_sizes: Union[TensorType, list[tuple]] = None, + nms_threshold: float = 0.7, + ): + """ + Converts the output of [`DetaForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.5): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + nms_threshold (`float`, *optional*, defaults to 0.7): + NMS threshold. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + batch_size, num_queries, num_labels = out_logits.shape + + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + prob = out_logits.sigmoid() + + all_scores = prob.view(batch_size, num_queries * num_labels).to(out_logits.device) + all_indexes = torch.arange(num_queries * num_labels)[None].repeat(batch_size, 1).to(out_logits.device) + all_boxes = torch.div(all_indexes, out_logits.shape[2], rounding_mode="floor") + all_labels = all_indexes % out_logits.shape[2] + + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, all_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, list): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for b in range(batch_size): + box = boxes[b] + score = all_scores[b] + lbls = all_labels[b] + + pre_topk = score.topk(min(10000, num_queries * num_labels)).indices + box = box[pre_topk] + score = score[pre_topk] + lbls = lbls[pre_topk] + + # apply NMS + keep_inds = batched_nms(box, score, lbls, nms_threshold)[:100] + score = score[keep_inds] + lbls = lbls[keep_inds] + box = box[keep_inds] + + results.append( + { + "scores": score[score > threshold], + "labels": lbls[score > threshold], + "boxes": box[score > threshold], + } + ) + + return results + + +__all__ = ["DetaImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/modeling_deta.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/modeling_deta.py new file mode 100644 index 0000000000000000000000000000000000000000..c4f6f5c65dedc4e7a4d6e5151cff61226ea4a7ed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/modeling_deta.py @@ -0,0 +1,2816 @@ +# coding=utf-8 +# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch DETA model.""" + +import copy +import math +import os +import warnings +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ....activations import ACT2FN +from ....file_utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_scipy_available, + is_torch_cuda_available, + is_vision_available, + replace_return_docstrings, +) +from ....modeling_attn_mask_utils import _prepare_4d_attention_mask +from ....modeling_layers import GradientCheckpointingLayer +from ....modeling_outputs import BaseModelOutput +from ....modeling_utils import PreTrainedModel +from ....pytorch_utils import meshgrid +from ....utils import is_accelerate_available, is_ninja_available, is_torchvision_available, logging, requires_backends +from ....utils.backbone_utils import load_backbone +from .configuration_deta import DetaConfig + + +logger = logging.get_logger(__name__) + +MultiScaleDeformableAttention = None + + +def load_cuda_kernels(): + from torch.utils.cpp_extension import load + + global MultiScaleDeformableAttention + + root = Path(__file__).resolve().parent.parent.parent.parent / "kernels" / "deta" + src_files = [ + root / filename + for filename in [ + "vision.cpp", + os.path.join("cpu", "ms_deform_attn_cpu.cpp"), + os.path.join("cuda", "ms_deform_attn_cuda.cu"), + ] + ] + + MultiScaleDeformableAttention = load( + "MultiScaleDeformableAttention", + src_files, + with_cuda=True, + extra_include_paths=[str(root)], + extra_cflags=["-DWITH_CUDA=1"], + extra_cuda_cflags=[ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ], + ) + + +class MultiScaleDeformableAttentionFunction(Function): + @staticmethod + def forward( + context, + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step, + ): + context.im2col_step = im2col_step + output = MultiScaleDeformableAttention.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + context.im2col_step, + ) + context.save_for_backward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights + ) + return output + + @staticmethod + @once_differentiable + def backward(context, grad_output): + ( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) = context.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output, + context.im2col_step, + ) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +if is_accelerate_available(): + from accelerate import PartialState + from accelerate.utils import reduce + +if is_vision_available(): + from transformers.image_transforms import center_to_corners_format + +if is_torchvision_available(): + from torchvision.ops.boxes import batched_nms + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "DetaConfig" +_CHECKPOINT_FOR_DOC = "jozhang97/deta-swin-large-o365" + + +@dataclass +class DetaDecoderOutput(ModelOutput): + """ + Base class for outputs of the DetaDecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +class DetaModelOutput(ModelOutput): + """ + Base class for outputs of the Deformable DETR encoder-decoder model. + + Args: + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + output_proposals (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): + Logits of proposal bounding boxes coordinates in the gen_encoder_output_proposals. + """ + + init_reference_points: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + output_proposals: Optional[torch.FloatTensor] = None + + +@dataclass +class DetaObjectDetectionOutput(ModelOutput): + """ + Output type of [`DetaForObjectDetection`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~DetaProcessor.post_process_object_detection`] to retrieve the + unnormalized bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4, + 4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average + in the self-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + output_proposals (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): + Logits of proposal bounding boxes coordinates in the gen_encoder_output_proposals. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[dict] = None + logits: Optional[torch.FloatTensor] = None + pred_boxes: Optional[torch.FloatTensor] = None + auxiliary_outputs: Optional[list[dict]] = None + init_reference_points: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + output_proposals: Optional[torch.FloatTensor] = None + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +class DetaFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `DetaFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = DetaFrozenBatchNorm2d(module.num_features) + + if module.weight.device != torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +class DetaBackboneWithPositionalEncodings(nn.Module): + """ + Backbone model with positional embeddings. + + nn.BatchNorm2d layers are replaced by DetaFrozenBatchNorm2d as defined above. + """ + + def __init__(self, config): + super().__init__() + + backbone = load_backbone(config) + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = self.model.channels + + # TODO fix this + if config.backbone_config.model_type == "resnet": + for name, parameter in self.model.named_parameters(): + if "stages.1" not in name and "stages.2" not in name and "stages.3" not in name: + parameter.requires_grad_(False) + + self.position_embedding = build_position_encoding(config) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + """ + Outputs feature maps of latter stages C_3 through C_5 in ResNet if `config.num_feature_levels > 1`, otherwise + outputs feature maps of C_5. + """ + # first, send pixel_values through the backbone to get list of feature maps + features = self.model(pixel_values).feature_maps + + # next, create position embeddings + out = [] + pos = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + position_embeddings = self.position_embedding(feature_map, mask).to(feature_map.dtype) + out.append((feature_map, mask)) + pos.append(position_embeddings) + + return out, pos + + +class DetaSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float() + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class DetaLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = DetaSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = DetaLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +def multi_scale_deformable_attention( + value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor +) -> Tensor: + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +class DetaMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Deformable DETR. + """ + + def __init__(self, config: DetaConfig, num_heads: int, n_points: int): + super().__init__() + + kernel_loaded = MultiScaleDeformableAttention is not None + if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded: + try: + load_cuda_kernels() + except Exception as e: + logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}") + + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in DetaMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." + ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + self._reset_parameters() + + def _reset_parameters(self): + nn.init.constant_(self.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(self.n_heads, dtype=torch.int64).to(default_dtype) * (2.0 * math.pi / self.n_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(self.n_heads, 1, 1, 2) + .repeat(1, self.n_levels, self.n_points, 1) + ) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(self.attention_weights.weight.data, 0.0) + nn.init.constant_(self.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(self.value_proj.weight.data) + nn.init.constant_(self.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(self.output_proj.weight.data) + nn.init.constant_(self.output_proj.bias.data, 0.0) + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + num_coordinates = reference_points.shape[-1] + if num_coordinates == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif num_coordinates == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + if self.disable_custom_kernels: + # PyTorch implementation + output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + else: + try: + # custom kernel + output = MultiScaleDeformableAttentionFunction.apply( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + except Exception: + # PyTorch implementation + output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + output = self.output_proj(output) + + return output, attention_weights + + +class DetaMultiheadAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, target_len, embed_dim = hidden_states.size() + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # get queries, keys and values + query_states = self.q_proj(hidden_states) * self.scaling + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + if attention_mask.dtype == torch.bool: + attention_mask = torch.zeros_like(attention_mask, dtype=attn_weights.dtype).masked_fill_( + attention_mask, -torch.inf + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class DetaEncoderLayer(nn.Module): + def __init__(self, config: DetaConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = DetaMultiscaleDeformableAttention( + config, + num_heads=config.encoder_attention_heads, + n_points=config.encoder_n_points, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class DetaDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DetaConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = DetaMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention + self.encoder_attn = DetaMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +class DetaPreTrainedModel(PreTrainedModel): + config: DetaConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + _no_split_modules = [r"DetaBackboneWithPositionalEncodings", r"DetaEncoderLayer", r"DetaDecoderLayer"] + supports_gradient_checkpointing = True + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, DetaLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, DetaMultiscaleDeformableAttention): + module._reset_parameters() + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + + +DETA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`DetaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DETA_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`AutoImageProcessor`]. See [`AutoImageProcessor.__call__`] for details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +class DetaEncoder(DetaPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a + [`DetaEncoderLayer`]. + + The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. + + Args: + config: DetaConfig + """ + + def __init__(self, config: DetaConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([DetaEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """ + Get reference points for each feature map. Used in decoder. + + Args: + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Valid ratios of each feature map. + device (`torch.device`): + Device on which to create the tensors. + Returns: + `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` + """ + reference_points_list = [] + for level, (height, width) in enumerate(spatial_shapes): + ref_y, ref_x = meshgrid( + torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device), + torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device), + indexing="ij", + ) + # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36 + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class DetaDecoder(DetaPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetaDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Deformable DETR: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. + + Args: + config: DetaConfig + """ + + def __init__(self, config: DetaConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([DetaDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.gradient_checkpointing = False + + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR + self.bbox_embed = None + self.class_embed = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the decoder. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Position embeddings that are added to the queries and keys in each self-attention layer. + reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): + Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. + spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of the feature maps. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*): + Indexes for the start of each feature level. In range `[0, sequence_length]`. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*): + Ratio of valid area in each feature level. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + + for idx, decoder_layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = ( + reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) + else: + if reference_points.shape[-1] != 2: + raise ValueError("Reference points' last dimension must be of size 2") + reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + encoder_hidden_states=encoder_hidden_states, + reference_points=reference_points_input, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + if reference_points.shape[-1] == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + if reference_points.shape[-1] != 2: + raise ValueError( + f"Reference points' last dimension must be of size 2, but is {reference_points.shape[-1]}" + ) + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + intermediate += (hidden_states,) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + return DetaDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + """ + The bare DETA Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without + any specific head on top. + """, + DETA_START_DOCSTRING, +) +class DetaModel(DetaPreTrainedModel): + def __init__(self, config: DetaConfig): + super().__init__(config) + + if config.two_stage: + requires_backends(self, ["torchvision"]) + + # Create backbone with positional encoding + self.backbone = DetaBackboneWithPositionalEncodings(config) + intermediate_channel_sizes = self.backbone.intermediate_channel_sizes + + # Create input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(intermediate_channel_sizes) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = intermediate_channel_sizes[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(intermediate_channel_sizes[-1], config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ] + ) + + if not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + + self.encoder = DetaEncoder(config) + self.decoder = DetaDecoder(config) + + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + + if config.two_stage: + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model) + self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) + self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) + self.pix_trans = nn.Linear(config.d_model, config.d_model) + self.pix_trans_norm = nn.LayerNorm(config.d_model) + else: + self.reference_points = nn.Linear(config.d_model, 2) + + self.assign_first_stage = config.assign_first_stage + self.two_stage_num_proposals = config.two_stage_num_proposals + + self.post_init() + + def get_encoder(self): + return self.encoder + + def freeze_backbone(self): + for name, param in self.backbone.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.model.named_parameters(): + param.requires_grad_(True) + + def get_valid_ratio(self, mask, dtype=torch.float32): + """Get the valid ratio of all feature maps.""" + + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_height = valid_height.to(dtype) / height + valid_ratio_width = valid_width.to(dtype) / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1) + return valid_ratio + + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=torch.int64, device=proposals.device).float() + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. + + Args: + enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. + padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. + spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps. + + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. + """ + batch_size = enc_output.shape[0] + proposals = [] + _cur = 0 + level_ids = [] + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_height = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4) + proposals.append(proposal) + _cur += height * width + level_ids.append(grid.new_ones(height * width, dtype=torch.long) * level) + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + level_ids = torch.cat(level_ids) + return object_query, output_proposals, level_ids + + @add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=DetaModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DetaModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, DetaModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("jozhang97/deta-swin-large-o365") + >>> model = DetaModel.from_pretrained("jozhang97/deta-swin-large-o365", two_stage=False) + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 900, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) + + # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # which is a list of tuples + features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + sources = [] + masks = [] + for level, (source, mask) in enumerate(features): + sources.append(self.input_proj[level](source)) + masks.append(mask) + if mask is None: + raise ValueError("No attention mask was provided") + + # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(sources): + _len_sources = len(sources) + for level in range(_len_sources, self.config.num_feature_levels): + if level == _len_sources: + source = self.input_proj[level](features[-1][0]) + else: + source = self.input_proj[level](sources[-1]) + mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) + sources.append(source) + masks.append(mask) + position_embeddings_list.append(pos_l) + + # Create queries + query_embeds = None + if not self.config.two_stage: + query_embeds = self.query_position_embeddings.weight + + # Prepare encoder inputs (by flattening) + spatial_shapes = [(source.shape[2:]) for source in sources] + source_flatten = [source.flatten(2).transpose(1, 2) for source in sources] + mask_flatten = [mask.flatten(1) for mask in masks] + + lvl_pos_embed_flatten = [] + for level, pos_embed in enumerate(position_embeddings_list): + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + + source_flatten = torch.cat(source_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + valid_ratios = valid_ratios.float() + + # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder + # Also provide spatial_shapes, level_start_index and valid_ratios + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=source_flatten, + attention_mask=mask_flatten, + position_embeddings=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, prepare decoder inputs + batch_size, _, num_channels = encoder_outputs[0].shape + enc_outputs_class = None + enc_outputs_coord_logits = None + output_proposals = None + if self.config.two_stage: + object_query_embedding, output_proposals, level_ids = self.gen_encoder_output_proposals( + encoder_outputs[0], ~mask_flatten, spatial_shapes + ) + + # hack implementation for two-stage DETA + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. foreground and background) + enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + # only keep top scoring `config.two_stage_num_proposals` proposals + topk = self.two_stage_num_proposals + proposal_logit = enc_outputs_class[..., 0] + + if self.assign_first_stage: + proposal_boxes = center_to_corners_format(enc_outputs_coord_logits.sigmoid().float()).clamp(0, 1) + topk_proposals = [] + for b in range(batch_size): + prop_boxes_b = proposal_boxes[b] + prop_logits_b = proposal_logit[b] + + # pre-nms per-level topk + pre_nms_topk = 1000 + pre_nms_inds = [] + for lvl in range(len(spatial_shapes)): + lvl_mask = level_ids == lvl + pre_nms_inds.append(torch.topk(prop_logits_b.sigmoid() * lvl_mask, pre_nms_topk)[1]) + pre_nms_inds = torch.cat(pre_nms_inds) + + # nms on topk indices + post_nms_inds = batched_nms( + prop_boxes_b[pre_nms_inds], prop_logits_b[pre_nms_inds], level_ids[pre_nms_inds], 0.9 + ) + keep_inds = pre_nms_inds[post_nms_inds] + + if len(keep_inds) < self.two_stage_num_proposals: + print( + f"[WARNING] nms proposals ({len(keep_inds)}) < {self.two_stage_num_proposals}, running" + " naive topk" + ) + keep_inds = torch.topk(proposal_logit[b], topk)[1] + + # keep top Q/L indices for L levels + q_per_l = topk // len(spatial_shapes) + is_level_ordered = ( + level_ids[keep_inds][None] + == torch.arange(len(spatial_shapes), device=level_ids.device)[:, None] + ) + keep_inds_mask = is_level_ordered & (is_level_ordered.cumsum(1) <= q_per_l) # LS + keep_inds_mask = keep_inds_mask.any(0) # S + + # pad to Q indices (might let ones filtered from pre-nms sneak by... unlikely because we pick high conf anyways) + if keep_inds_mask.sum() < topk: + num_to_add = topk - keep_inds_mask.sum() + pad_inds = (~keep_inds_mask).nonzero()[:num_to_add] + keep_inds_mask[pad_inds] = True + + keep_inds_topk = keep_inds[keep_inds_mask] + topk_proposals.append(keep_inds_topk) + topk_proposals = torch.stack(topk_proposals) + else: + topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) + query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + + topk_feats = torch.stack( + [object_query_embedding[b][topk_proposals[b]] for b in range(batch_size)] + ).detach() + target = target + self.pix_trans_norm(self.pix_trans(topk_feats)) + else: + query_embed, target = torch.split(query_embeds, num_channels, dim=1) + query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) + target = target.unsqueeze(0).expand(batch_size, -1, -1) + reference_points = self.reference_points(query_embed).sigmoid() + init_reference_points = reference_points + + decoder_outputs = self.decoder( + inputs_embeds=target, + position_embeddings=query_embed, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) + tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + + return tuple_outputs + + return DetaModelOutput( + init_reference_points=init_reference_points, + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + output_proposals=output_proposals, + ) + + +@add_start_docstrings( + """ + DETA Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks + such as COCO detection. + """, + DETA_START_DOCSTRING, +) +class DetaForObjectDetection(DetaPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [r"bbox_embed\.\d+", r"class_embed\.\d+"] + # We can't initialize the model on meta device as some weights are modified during the initialization + _no_split_modules = None + + def __init__(self, config: DetaConfig): + super().__init__(config) + + # Deformable DETR encoder-decoder model + self.model = DetaModel(config) + + # Detection heads on top + self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.bbox_embed = DetaMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data.fill_(bias_value) + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers + if config.with_box_refine: + self.class_embed = _get_clones(self.class_embed, num_pred) + self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # hack implementation for iterative bounding box refinement + self.model.decoder.bbox_embed = self.bbox_embed + else: + nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.model.decoder.bbox_embed = None + if config.two_stage: + # hack implementation for two-stage + self.model.decoder.class_embed = self.class_embed + for box_embed in self.bbox_embed: + nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + + # Initialize weights and apply final processing + self.post_init() + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + aux_loss = [ + {"logits": logits, "pred_boxes": pred_boxes} + for logits, pred_boxes in zip(outputs_class.transpose(0, 1)[:-1], outputs_coord.transpose(0, 1)[:-1]) + ] + return aux_loss + + @add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=DetaObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[list[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DetaObjectDetectionOutput]: + r""" + labels (`list[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, DetaForObjectDetection + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("jozhang97/deta-swin-large") + >>> model = DetaForObjectDetection.from_pretrained("jozhang97/deta-swin-large") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ + ... 0 + ... ] + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ... ) + Detected cat with confidence 0.802 at location [9.87, 54.36, 316.93, 473.44] + Detected cat with confidence 0.795 at location [346.62, 24.35, 639.62, 373.2] + Detected remote with confidence 0.725 at location [40.41, 73.36, 175.77, 117.29] + Detected remote with confidence 0.638 at location [333.34, 76.81, 370.22, 187.94] + Detected couch with confidence 0.584 at location [0.03, 0.99, 640.02, 474.93] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + init_reference = outputs.init_reference_points if return_dict else outputs[0] + inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] + + for level in range(hidden_states.shape[1]): + if level == 0: + reference = init_reference + else: + reference = inter_references[:, level - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[level](hidden_states[:, level]) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + if reference.shape[-1] == 4: + outputs_coord_logits = delta_bbox + reference + elif reference.shape[-1] == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + # Keep batch_size as first dimension + outputs_class = torch.stack(outputs_classes, dim=1) + outputs_coord = torch.stack(outputs_coords, dim=1) + + logits = outputs_class[:, -1] + pred_boxes = outputs_coord[:, -1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + # First: create the matcher + matcher = DetaHungarianMatcher( + class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost + ) + # Second: create the criterion + losses = ["labels", "boxes", "cardinality"] + criterion = DetaLoss( + matcher=matcher, + num_classes=self.config.num_labels, + focal_alpha=self.config.focal_alpha, + losses=losses, + num_queries=self.config.num_queries, + assign_first_stage=self.config.assign_first_stage, + assign_second_stage=self.config.assign_second_stage, + ) + criterion.to(logits.device) + # Third: compute the losses, based on outputs and labels + outputs_loss = {} + outputs_loss["logits"] = logits + outputs_loss["pred_boxes"] = pred_boxes + outputs_loss["init_reference"] = init_reference + if self.config.auxiliary_loss: + auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord) + outputs_loss["auxiliary_outputs"] = auxiliary_outputs + if self.config.two_stage: + enc_outputs_coord = outputs.enc_outputs_coord_logits.sigmoid() + outputs_loss["enc_outputs"] = { + "logits": outputs.enc_outputs_class, + "pred_boxes": enc_outputs_coord, + "anchors": outputs.output_proposals.sigmoid(), + } + + loss_dict = criterion(outputs_loss, labels) + # Fourth: compute total loss, as a weighted sum of the various losses + weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient} + weight_dict["loss_giou"] = self.config.giou_loss_coefficient + if self.config.auxiliary_loss: + aux_weight_dict = {} + for i in range(self.config.decoder_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + aux_weight_dict.update({k + "_enc": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict if k in weight_dict) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + + return tuple_outputs + + dict_outputs = DetaObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + output_proposals=outputs.output_proposals, + ) + + return dict_outputs + + +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://huggingface.co/papers/1708.02002. + + Args: + inputs (`torch.FloatTensor` of arbitrary shape): + The predictions for each example. + targets (`torch.FloatTensor` with the same shape as `inputs`) + A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class + and 1 for the positive class). + alpha (`float`, *optional*, defaults to `0.25`): + Optional weighting factor in the range (0,1) to balance positive vs. negative examples. + gamma (`int`, *optional*, defaults to `2`): + Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + # add modulating factor + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +class DetaLoss(nn.Module): + """ + This class computes the losses for `DetaForObjectDetection`. The process happens in two steps: 1) we compute + hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched + ground-truth / prediction (supervised class and box). + + Args: + matcher (`DetaHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + focal_alpha (`float`): + Alpha parameter in focal loss. + losses (`list[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ + + def __init__( + self, + matcher, + num_classes, + focal_alpha, + losses, + num_queries, + assign_first_stage=False, + assign_second_stage=False, + ): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses + self.assign_first_stage = assign_first_stage + self.assign_second_stage = assign_second_stage + + if self.assign_first_stage: + self.stg1_assigner = DetaStage1Assigner() + if self.assign_second_stage: + self.stg2_assigner = DetaStage2Assigner(num_queries) + + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + source_logits = outputs["logits"] + + idx = self._get_source_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros( + [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], + dtype=source_logits.dtype, + layout=source_logits.layout, + device=source_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * source_logits.shape[1] + ) + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. + """ + logits = outputs["logits"] + device = logits.device + target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_source_permutation_idx(indices) + source_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def _get_source_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) + source_idx = torch.cat([source for (source, _) in indices]) + return batch_idx, source_idx + + def _get_target_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) + target_idx = torch.cat([target for (_, target) in indices]) + return batch_idx, target_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`list[dict]`, *optional*): + List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the + losses applied, see each loss' doc. + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k not in ("auxiliary_outputs", "enc_outputs")} + + # Retrieve the matching between the outputs of the last layer and the targets + if self.assign_second_stage: + indices = self.stg2_assigner(outputs_without_aux, targets) + else: + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes across all nodes, for normalization purposes + num_boxes = sum(len(t["class_labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + # Check that we have initialized the distributed state + world_size = 1 + if is_accelerate_available(): + if PartialState._shared_state != {}: + num_boxes = reduce(num_boxes) + world_size = PartialState().num_processes + num_boxes = torch.clamp(num_boxes / world_size, min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "auxiliary_outputs" in outputs: + for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]): + if not self.assign_second_stage: + indices = self.matcher(auxiliary_outputs, targets) + for loss in self.losses: + l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + if "enc_outputs" in outputs: + enc_outputs = outputs["enc_outputs"] + bin_targets = copy.deepcopy(targets) + for bt in bin_targets: + bt["class_labels"] = torch.zeros_like(bt["class_labels"]) + if self.assign_first_stage: + indices = self.stg1_assigner(enc_outputs, bin_targets) + else: + indices = self.matcher(enc_outputs, bin_targets) + for loss in self.losses: + l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes) + l_dict = {k + "_enc": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +class DetaMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +class DetaHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. + giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`list[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. + + Returns: + `list[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + target_ids = torch.cat([v["class_labels"] for v in targets]) + target_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. + alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. + + Args: + boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): + Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 + < x2` and `0 <= y1 < y2`. + + Returns: + `torch.FloatTensor`: a tensor containing the area for each box. + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] + inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. + + Returns: + `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): + raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") + if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): + raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") + iou, union = box_iou(boxes1, boxes2) + + top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] + area = width_height[:, :, 0] * width_height[:, :, 1] + + return iou - (area - union) / area + + +# from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/layers/wrappers.py#L100 +def nonzero_tuple(x): + """ + A 'as_tuple=True' version of torch.nonzero to support torchscript. because of + https://github.com/pytorch/pytorch/issues/38718 + """ + if torch.jit.is_scripting(): + if x.dim() == 0: + return x.unsqueeze(0).nonzero().unbind(1) + return x.nonzero().unbind(1) + else: + return x.nonzero(as_tuple=True) + + +# from https://github.com/facebookresearch/detectron2/blob/9921a2caa585d4fa66c4b534b6fab6e74d89b582/detectron2/modeling/matcher.py#L9 +class DetaMatcher: + """ + This class assigns to each predicted "element" (e.g., a box) a ground-truth element. Each predicted element will + have exactly zero or one matches; each ground-truth element may be matched to zero or more predicted elements. + + The matching is determined by the MxN match_quality_matrix, that characterizes how well each (ground-truth, + prediction)-pair match each other. For example, if the elements are boxes, this matrix may contain box + intersection-over-union overlap values. + + The matcher returns (a) a vector of length N containing the index of the ground-truth element m in [0, M) that + matches to prediction n in [0, N). (b) a vector of length N containing the labels for each prediction. + """ + + def __init__(self, thresholds: list[float], labels: list[int], allow_low_quality_matches: bool = False): + """ + Args: + thresholds (`list[float]`): + A list of thresholds used to stratify predictions into levels. + labels (`list[int`): + A list of values to label predictions belonging at each level. A label can be one of {-1, 0, 1} + signifying {ignore, negative class, positive class}, respectively. + allow_low_quality_matches (`bool`, *optional*, defaults to `False`): + If `True`, produce additional matches for predictions with maximum match quality lower than + high_threshold. See `set_low_quality_matches_` for more details. + + For example, + thresholds = [0.3, 0.5] labels = [0, -1, 1] All predictions with iou < 0.3 will be marked with 0 and + thus will be considered as false positives while training. All predictions with 0.3 <= iou < 0.5 will + be marked with -1 and thus will be ignored. All predictions with 0.5 <= iou will be marked with 1 and + thus will be considered as true positives. + """ + # Add -inf and +inf to first and last position in thresholds + thresholds = thresholds[:] + if thresholds[0] < 0: + raise ValueError("Thresholds should be positive") + thresholds.insert(0, -float("inf")) + thresholds.append(float("inf")) + # Currently torchscript does not support all + generator + if not all(low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])): + raise ValueError("Thresholds should be sorted.") + if not all(l in [-1, 0, 1] for l in labels): + raise ValueError("All labels should be either -1, 0 or 1") + if len(labels) != len(thresholds) - 1: + raise ValueError("Number of labels should be equal to number of thresholds - 1") + self.thresholds = thresholds + self.labels = labels + self.allow_low_quality_matches = allow_low_quality_matches + + def __call__(self, match_quality_matrix): + """ + Args: + match_quality_matrix (Tensor[float]): an MxN tensor, containing the + pairwise quality between M ground-truth elements and N predicted elements. All elements must be >= 0 + (due to the us of `torch.nonzero` for selecting indices in `set_low_quality_matches_`). + + Returns: + matches (Tensor[int64]): a vector of length N, where matches[i] is a matched + ground-truth index in [0, M) + match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates + whether a prediction is a true or false positive or ignored + """ + assert match_quality_matrix.dim() == 2 + if match_quality_matrix.numel() == 0: + default_matches = match_quality_matrix.new_full((match_quality_matrix.size(1),), 0, dtype=torch.int64) + # When no gt boxes exist, we define IOU = 0 and therefore set labels + # to `self.labels[0]`, which usually defaults to background class 0 + # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds + default_match_labels = match_quality_matrix.new_full( + (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8 + ) + return default_matches, default_match_labels + + assert torch.all(match_quality_matrix >= 0) + + # match_quality_matrix is M (gt) x N (predicted) + # Max over gt elements (dim 0) to find best gt candidate for each prediction + matched_vals, matches = match_quality_matrix.max(dim=0) + + match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8) + + for l, low, high in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]): + low_high = (matched_vals >= low) & (matched_vals < high) + match_labels[low_high] = l + + if self.allow_low_quality_matches: + self.set_low_quality_matches_(match_labels, match_quality_matrix) + + return matches, match_labels + + def set_low_quality_matches_(self, match_labels, match_quality_matrix): + """ + Produce additional matches for predictions that have only low-quality matches. Specifically, for each + ground-truth G find the set of predictions that have maximum overlap with it (including ties); for each + prediction in that set, if it is unmatched, then match it to the ground-truth G. + + This function implements the RPN assignment case (i) in Sec. 3.1.2 of :paper:`Faster R-CNN`. + """ + # For each gt, find the prediction with which it has highest quality + highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1) + # Find the highest quality match available, even if it is low, including ties. + # Note that the matches qualities must be positive due to the use of + # `torch.nonzero`. + _, pred_inds_with_highest_quality = nonzero_tuple(match_quality_matrix == highest_quality_foreach_gt[:, None]) + # If an anchor was labeled positive only due to a low-quality match + # with gt_A, but it has larger overlap with gt_B, it's matched index will still be gt_B. + # This follows the implementation in Detectron, and is found to have no significant impact. + match_labels[pred_inds_with_highest_quality] = 1 + + +# from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/sampling.py#L9 +def subsample_labels(labels: torch.Tensor, num_samples: int, positive_fraction: float, bg_label: int): + """ + Return `num_samples` (or fewer, if not enough found) random samples from `labels` which is a mixture of positives & + negatives. It will try to return as many positives as possible without exceeding `positive_fraction * num_samples`, + and then try to fill the remaining slots with negatives. + + Args: + labels (Tensor): (N, ) label vector with values: + * -1: ignore + * bg_label: background ("negative") class + * otherwise: one or more foreground ("positive") classes + num_samples (int): The total number of labels with value >= 0 to return. + Values that are not sampled will be filled with -1 (ignore). + positive_fraction (float): The number of subsampled labels with values > 0 + is `min(num_positives, int(positive_fraction * num_samples))`. The number of negatives sampled is + `min(num_negatives, num_samples - num_positives_sampled)`. In order words, if there are not enough + positives, the sample is filled with negatives. If there are also not enough negatives, then as many + elements are sampled as is possible. + bg_label (int): label index of background ("negative") class. + + Returns: + pos_idx, neg_idx (Tensor): + 1D vector of indices. The total length of both is `num_samples` or fewer. + """ + positive = nonzero_tuple((labels != -1) & (labels != bg_label))[0] + negative = nonzero_tuple(labels == bg_label)[0] + + num_pos = int(num_samples * positive_fraction) + # protect against not enough positive examples + num_pos = min(positive.numel(), num_pos) + num_neg = num_samples - num_pos + # protect against not enough negative examples + num_neg = min(negative.numel(), num_neg) + + # randomly select positive and negative examples + perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] + perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] + + pos_idx = positive[perm1] + neg_idx = negative[perm2] + return pos_idx, neg_idx + + +def sample_topk_per_gt(pr_inds, gt_inds, iou, k): + if len(gt_inds) == 0: + return pr_inds, gt_inds + # find topk matches for each gt + gt_inds2, counts = gt_inds.unique(return_counts=True) + scores, pr_inds2 = iou[gt_inds2].topk(k, dim=1) + gt_inds2 = gt_inds2[:, None].repeat(1, k) + + # filter to as many matches that gt has + pr_inds3 = torch.cat([pr[:c] for c, pr in zip(counts, pr_inds2)]) + gt_inds3 = torch.cat([gt[:c] for c, gt in zip(counts, gt_inds2)]) + return pr_inds3, gt_inds3 + + +# modified from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/roi_heads/roi_heads.py#L123 +class DetaStage2Assigner(nn.Module): + def __init__(self, num_queries, max_k=4): + super().__init__() + self.positive_fraction = 0.25 + self.bg_label = 400 # number > 91 to filter out later + self.batch_size_per_image = num_queries + self.proposal_matcher = DetaMatcher(thresholds=[0.6], labels=[0, 1], allow_low_quality_matches=True) + self.k = max_k + + def _sample_proposals(self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor): + """ + Based on the matching between N proposals and M groundtruth, sample the proposals and set their classification + labels. + + Args: + matched_idxs (Tensor): a vector of length N, each is the best-matched + gt index in [0, M) for each proposal. + matched_labels (Tensor): a vector of length N, the matcher's label + (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal. + gt_classes (Tensor): a vector of length M. + + Returns: + Tensor: a vector of indices of sampled proposals. Each is in [0, N). Tensor: a vector of the same length, + the classification label for + each sampled proposal. Each sample is labeled as either a category in [0, num_classes) or the + background (num_classes). + """ + has_gt = gt_classes.numel() > 0 + # Get the corresponding GT for each proposal + if has_gt: + gt_classes = gt_classes[matched_idxs] + # Label unmatched proposals (0 label from matcher) as background (label=num_classes) + gt_classes[matched_labels == 0] = self.bg_label + # Label ignore proposals (-1 label) + gt_classes[matched_labels == -1] = -1 + else: + gt_classes = torch.zeros_like(matched_idxs) + self.bg_label + + sampled_fg_idxs, sampled_bg_idxs = subsample_labels( + gt_classes, self.batch_size_per_image, self.positive_fraction, self.bg_label + ) + + sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0) + return sampled_idxs, gt_classes[sampled_idxs] + + def forward(self, outputs, targets, return_cost_matrix=False): + # COCO categories are from 1 to 90. They set num_classes=91 and apply sigmoid. + + bs = len(targets) + indices = [] + ious = [] + for b in range(bs): + iou, _ = box_iou( + center_to_corners_format(targets[b]["boxes"]), + center_to_corners_format(outputs["init_reference"][b].detach()), + ) + matched_idxs, matched_labels = self.proposal_matcher( + iou + ) # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.6, 0 ow] + ( + sampled_idxs, + sampled_gt_classes, + ) = self._sample_proposals( # list of sampled proposal_ids, sampled_id -> [0, num_classes)+[bg_label] + matched_idxs, matched_labels, targets[b]["class_labels"] + ) + pos_pr_inds = sampled_idxs[sampled_gt_classes != self.bg_label] + pos_gt_inds = matched_idxs[pos_pr_inds] + pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou) + indices.append((pos_pr_inds, pos_gt_inds)) + ious.append(iou) + if return_cost_matrix: + return indices, ious + return indices + + def postprocess_indices(self, pr_inds, gt_inds, iou): + return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k) + + +# modified from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/proposal_generator/rpn.py#L181 +class DetaStage1Assigner(nn.Module): + def __init__(self, t_low=0.3, t_high=0.7, max_k=4): + super().__init__() + self.positive_fraction = 0.5 + self.batch_size_per_image = 256 + self.k = max_k + self.t_low = t_low + self.t_high = t_high + self.anchor_matcher = DetaMatcher( + thresholds=[t_low, t_high], labels=[0, -1, 1], allow_low_quality_matches=True + ) + + def _subsample_labels(self, label): + """ + Randomly sample a subset of positive and negative examples, and overwrite the label vector to the ignore value + (-1) for all elements that are not included in the sample. + + Args: + labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned. + """ + pos_idx, neg_idx = subsample_labels(label, self.batch_size_per_image, self.positive_fraction, 0) + # Fill with the ignore label (-1), then set positive and negative labels + label.fill_(-1) + label.scatter_(0, pos_idx, 1) + label.scatter_(0, neg_idx, 0) + return label + + def forward(self, outputs, targets): + bs = len(targets) + indices = [] + for b in range(bs): + anchors = outputs["anchors"][b] + if len(targets[b]["boxes"]) == 0: + indices.append( + ( + torch.tensor([], dtype=torch.long, device=anchors.device), + torch.tensor([], dtype=torch.long, device=anchors.device), + ) + ) + continue + iou, _ = box_iou( + center_to_corners_format(targets[b]["boxes"]), + center_to_corners_format(anchors), + ) + matched_idxs, matched_labels = self.anchor_matcher( + iou + ) # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.7, 0 if iou < 0.3, -1 ow] + matched_labels = self._subsample_labels(matched_labels) + + all_pr_inds = torch.arange(len(anchors), device=matched_labels.device) + pos_pr_inds = all_pr_inds[matched_labels == 1] + pos_gt_inds = matched_idxs[pos_pr_inds] + pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou) + pos_pr_inds, pos_gt_inds = pos_pr_inds.to(anchors.device), pos_gt_inds.to(anchors.device) + indices.append((pos_pr_inds, pos_gt_inds)) + return indices + + def postprocess_indices(self, pr_inds, gt_inds, iou): + return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k) + + +__all__ = ["DetaForObjectDetection", "DetaModel", "DetaPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..db3d0a634051c713aee33b48f2962647ac48d0eb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_efficientformer import * + from .image_processing_efficientformer import * + from .modeling_efficientformer import * + from .modeling_tf_efficientformer import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/configuration_efficientformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/configuration_efficientformer.py new file mode 100644 index 0000000000000000000000000000000000000000..9aaee649ba047e5967f132eb0b3527dce5855d11 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/configuration_efficientformer.py @@ -0,0 +1,170 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""EfficientFormer model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class EfficientFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`EfficientFormerModel`]. It is used to + instantiate an EfficientFormer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the EfficientFormer + [snap-research/efficientformer-l1](https://huggingface.co/snap-research/efficientformer-l1) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + depths (`List(int)`, *optional*, defaults to `[3, 2, 6, 4]`) + Depth of each stage. + hidden_sizes (`List(int)`, *optional*, defaults to `[48, 96, 224, 448]`) + Dimensionality of each stage. + downsamples (`List(bool)`, *optional*, defaults to `[True, True, True, True]`) + Whether or not to downsample inputs between two stages. + dim (`int`, *optional*, defaults to 448): + Number of channels in Meta3D layers + key_dim (`int`, *optional*, defaults to 32): + The size of the key in meta3D block. + attention_ratio (`int`, *optional*, defaults to 4): + Ratio of the dimension of the query and value to the dimension of the key in MSHA block + resolution (`int`, *optional*, defaults to 7) + Size of each patch + num_hidden_layers (`int`, *optional*, defaults to 5): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the 3D MetaBlock. + mlp_expansion_ratio (`int`, *optional*, defaults to 4): + Ratio of size of the hidden dimensionality of an MLP to the dimensionality of its input. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings and encoder. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + pool_size (`int`, *optional*, defaults to 3): + Kernel size of pooling layers. + downsample_patch_size (`int`, *optional*, defaults to 3): + The size of patches in downsampling layers. + downsample_stride (`int`, *optional*, defaults to 2): + The stride of convolution kernels in downsampling layers. + downsample_pad (`int`, *optional*, defaults to 1): + Padding in downsampling layers. + drop_path_rate (`int`, *optional*, defaults to 0): + Rate at which to increase dropout probability in DropPath. + num_meta3d_blocks (`int`, *optional*, defaults to 1): + The number of 3D MetaBlocks in the last stage. + distillation (`bool`, *optional*, defaults to `True`): + Whether to add a distillation head. + use_layer_scale (`bool`, *optional*, defaults to `True`): + Whether to scale outputs from token mixers. + layer_scale_init_value (`float`, *optional*, defaults to 1e-5): + Factor by which outputs from token mixers are scaled. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to `224`): + The size (resolution) of each image. + + Example: + + ```python + >>> from transformers import EfficientFormerConfig, EfficientFormerModel + + >>> # Initializing a EfficientFormer efficientformer-l1 style configuration + >>> configuration = EfficientFormerConfig() + + >>> # Initializing a EfficientFormerModel (with random weights) from the efficientformer-l3 style configuration + >>> model = EfficientFormerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "efficientformer" + + def __init__( + self, + depths: list[int] = [3, 2, 6, 4], + hidden_sizes: list[int] = [48, 96, 224, 448], + downsamples: list[bool] = [True, True, True, True], + dim: int = 448, + key_dim: int = 32, + attention_ratio: int = 4, + resolution: int = 7, + num_hidden_layers: int = 5, + num_attention_heads: int = 8, + mlp_expansion_ratio: int = 4, + hidden_dropout_prob: float = 0.0, + patch_size: int = 16, + num_channels: int = 3, + pool_size: int = 3, + downsample_patch_size: int = 3, + downsample_stride: int = 2, + downsample_pad: int = 1, + drop_path_rate: float = 0.0, + num_meta3d_blocks: int = 1, + distillation: bool = True, + use_layer_scale: bool = True, + layer_scale_init_value: float = 1e-5, + hidden_act: str = "gelu", + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-12, + image_size: int = 224, + batch_norm_eps: float = 1e-05, + **kwargs, + ) -> None: + super().__init__(**kwargs) + + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.hidden_sizes = hidden_sizes + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.patch_size = patch_size + self.num_channels = num_channels + self.depths = depths + self.mlp_expansion_ratio = mlp_expansion_ratio + self.downsamples = downsamples + self.dim = dim + self.key_dim = key_dim + self.attention_ratio = attention_ratio + self.resolution = resolution + self.pool_size = pool_size + self.downsample_patch_size = downsample_patch_size + self.downsample_stride = downsample_stride + self.downsample_pad = downsample_pad + self.drop_path_rate = drop_path_rate + self.num_meta3d_blocks = num_meta3d_blocks + self.distillation = distillation + self.use_layer_scale = use_layer_scale + self.layer_scale_init_value = layer_scale_init_value + self.image_size = image_size + self.batch_norm_eps = batch_norm_eps + + +__all__ = [ + "EfficientFormerConfig", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py new file mode 100644 index 0000000000000000000000000000000000000000..a2dd1281e920fdea3b4c51ef79c09bc86c605921 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py @@ -0,0 +1,324 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for EfficientFormer.""" + +from typing import Optional, Union + +import numpy as np + +from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ....image_transforms import ( + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ....image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_batched, + is_scaled_image, + to_numpy_array, + valid_images, + validate_kwargs, + validate_preprocess_arguments, +) +from ....utils import TensorType, logging + + +logger = logging.get_logger(__name__) + + +class EfficientFormerImageProcessor(BaseImageProcessor): + r""" + Constructs a EfficientFormer image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize: + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + crop_size: Optional[dict[str, int]] = None, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self._valid_processor_keys = [ + "images", + "do_resize", + "size", + "resample", + "do_center_crop", + "crop_size", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "return_tensors", + "data_format", + "input_data_format", + ] + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample: + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + + if "shortest_edge" in size: + size = get_resize_output_image_size( + image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format + ) + # size = get_resize_output_image_size(image, size["shortest_edge"], size["longest_edge"]) + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError(f"Size must contain 'height' and 'width' keys or 'shortest_edge' key. Got {size.keys()}") + return resize( + image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + ) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + resample = resample if resample is not None else self.resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + size_dict = get_size_dict(size) + + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_center_crop: + images = [ + self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["EfficientFormerImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_efficientformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_efficientformer.py new file mode 100644 index 0000000000000000000000000000000000000000..d35d3e82c007c6c6cf9326e1c933df34c6ae41ba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_efficientformer.py @@ -0,0 +1,786 @@ +# coding=utf-8 +# Copyright 2022 Snapchat Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch EfficientFormer model.""" + +import itertools +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ....activations import ACT2FN +from ....modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ....modeling_utils import PreTrainedModel +from ....utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_efficientformer import EfficientFormerConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "EfficientFormerConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300" +_EXPECTED_OUTPUT_SHAPE = [1, 49, 448] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" + + +class EfficientFormerPatchEmbeddings(nn.Module): + """ + This class performs downsampling between two stages. For the input tensor with the shape [batch_size, num_channels, + height, width] it produces output tensor with the shape [batch_size, num_channels, height/stride, width/stride] + """ + + def __init__(self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True): + super().__init__() + self.num_channels = num_channels + + self.projection = nn.Conv2d( + num_channels, + embed_dim, + kernel_size=config.downsample_patch_size, + stride=config.downsample_stride, + padding=config.downsample_pad, + ) + self.norm = nn.BatchNorm2d(embed_dim, eps=config.batch_norm_eps) if apply_norm else nn.Identity() + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + + embeddings = self.projection(pixel_values) + embeddings = self.norm(embeddings) + + return embeddings + + +class EfficientFormerSelfAttention(nn.Module): + def __init__(self, dim: int, key_dim: int, num_heads: int, attention_ratio: int, resolution: int): + super().__init__() + + self.num_heads = num_heads + self.key_dim = key_dim + self.attention_ratio = attention_ratio + self.scale = key_dim**-0.5 + self.total_key_dim = key_dim * num_heads + self.expanded_key_dim = int(attention_ratio * key_dim) + self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads) + hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2 + self.qkv = nn.Linear(dim, hidden_size) + self.projection = nn.Linear(self.total_expanded_key_dim, dim) + points = list(itertools.product(range(resolution), range(resolution))) + num_points = len(points) + attention_offsets = {} + idxs = [] + for point_1 in points: + for point_2 in points: + offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1])) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + idxs.append(attention_offsets[offset]) + self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(attention_offsets))) + self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(num_points, num_points)) + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and hasattr(self, "ab"): + del self.ab + else: + self.ab = self.attention_biases[:, self.attention_bias_idxs] + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]: + batch_size, sequence_length, num_channels = hidden_states.shape + qkv = self.qkv(hidden_states) + query_layer, key_layer, value_layer = qkv.reshape(batch_size, sequence_length, self.num_heads, -1).split( + [self.key_dim, self.key_dim, self.expanded_key_dim], dim=3 + ) + query_layer = query_layer.permute(0, 2, 1, 3) + key_layer = key_layer.permute(0, 2, 1, 3) + value_layer = value_layer.permute(0, 2, 1, 3) + + # set `model.to(torch_device)` won't change `self.ab.device`, if there is no follow-up `train` or `eval` call. + # Let's do it manually here, so users won't have to do this everytime. + if not self.training: + self.ab = self.ab.to(self.attention_biases.device) + attention_probs = (torch.matmul(query_layer, key_layer.transpose(-2, -1))) * self.scale + ( + self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab + ) + + attention_probs = attention_probs.softmax(dim=-1) + + context_layer = torch.matmul(attention_probs, value_layer).transpose(1, 2) + context_layer = context_layer.reshape(batch_size, sequence_length, self.total_expanded_key_dim) + context_layer = self.projection(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class EfficientFormerConvStem(nn.Module): + def __init__(self, config: EfficientFormerConfig, out_channels: int): + super().__init__() + + self.convolution1 = nn.Conv2d(config.num_channels, out_channels // 2, kernel_size=3, stride=2, padding=1) + self.batchnorm_before = nn.BatchNorm2d(out_channels // 2, eps=config.batch_norm_eps) + + self.convolution2 = nn.Conv2d(out_channels // 2, out_channels, kernel_size=3, stride=2, padding=1) + self.batchnorm_after = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps) + + self.activation = nn.ReLU() + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + features = self.batchnorm_before(self.convolution1(pixel_values)) + features = self.activation(features) + features = self.batchnorm_after(self.convolution2(features)) + features = self.activation(features) + + return features + + +class EfficientFormerPooling(nn.Module): + def __init__(self, pool_size: int): + super().__init__() + self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = self.pool(hidden_states) - hidden_states + return output + + +class EfficientFormerDenseMlp(nn.Module): + def __init__( + self, + config: EfficientFormerConfig, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.linear_in = nn.Linear(in_features, hidden_features) + self.activation = ACT2FN[config.hidden_act] + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.linear_out = nn.Linear(hidden_features, out_features) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.linear_in(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.linear_out(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class EfficientFormerConvMlp(nn.Module): + def __init__( + self, + config: EfficientFormerConfig, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + drop: float = 0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.convolution1 = nn.Conv2d(in_features, hidden_features, 1) + self.activation = ACT2FN[config.hidden_act] + self.convolution2 = nn.Conv2d(hidden_features, out_features, 1) + self.dropout = nn.Dropout(drop) + + self.batchnorm_before = nn.BatchNorm2d(hidden_features, eps=config.batch_norm_eps) + self.batchnorm_after = nn.BatchNorm2d(out_features, eps=config.batch_norm_eps) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.convolution1(hidden_state) + hidden_state = self.batchnorm_before(hidden_state) + + hidden_state = self.activation(hidden_state) + hidden_state = self.dropout(hidden_state) + hidden_state = self.convolution2(hidden_state) + + hidden_state = self.batchnorm_after(hidden_state) + hidden_state = self.dropout(hidden_state) + + return hidden_state + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class EfficientFormerDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class EfficientFormerFlat(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]: + hidden_states = hidden_states.flatten(2).transpose(1, 2) + return hidden_states + + +class EfficientFormerMeta3D(nn.Module): + def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0): + super().__init__() + + self.token_mixer = EfficientFormerSelfAttention( + dim=config.dim, + key_dim=config.key_dim, + num_heads=config.num_attention_heads, + attention_ratio=config.attention_ratio, + resolution=config.resolution, + ) + + self.layernorm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.layernorm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps) + + mlp_hidden_dim = int(dim * config.mlp_expansion_ratio) + self.mlp = EfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim) + + self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.use_layer_scale = config.use_layer_scale + if config.use_layer_scale: + self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True) + self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]: + self_attention_outputs = self.token_mixer(self.layernorm1(hidden_states), output_attentions) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.use_layer_scale: + layer_output = hidden_states + self.drop_path( + self.layer_scale_1.unsqueeze(0).unsqueeze(0) * attention_output + ) + layer_output = layer_output + self.drop_path( + self.layer_scale_2.unsqueeze(0).unsqueeze(0) * self.mlp(self.layernorm2(layer_output)) + ) + else: + layer_output = hidden_states + self.drop_path(attention_output) + layer_output = layer_output + self.drop_path(self.mlp(self.layernorm2(layer_output))) + + outputs = (layer_output,) + outputs + + return outputs + + +class EfficientFormerMeta3DLayers(nn.Module): + def __init__(self, config: EfficientFormerConfig): + super().__init__() + drop_paths = [ + config.drop_path_rate * (block_idx + sum(config.depths[:-1])) + for block_idx in range(config.num_meta3d_blocks) + ] + self.blocks = nn.ModuleList( + [EfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path) for drop_path in drop_paths] + ) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]: + all_attention_outputs = () if output_attentions else None + + for layer_module in self.blocks: + if isinstance(hidden_states, tuple): + hidden_states = hidden_states[0] + + hidden_states = layer_module(hidden_states, output_attentions) + + if output_attentions: + all_attention_outputs = all_attention_outputs + (hidden_states[1],) + + if output_attentions: + outputs = (hidden_states[0],) + all_attention_outputs + return outputs + + return hidden_states + + +class EfficientFormerMeta4D(nn.Module): + def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0): + super().__init__() + pool_size = config.pool_size if config.pool_size is not None else 3 + self.token_mixer = EfficientFormerPooling(pool_size=pool_size) + mlp_hidden_dim = int(dim * config.mlp_expansion_ratio) + self.mlp = EfficientFormerConvMlp( + config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob + ) + + self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.use_layer_scale = config.use_layer_scale + if config.use_layer_scale: + self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True) + self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True) + + def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]: + outputs = self.token_mixer(hidden_states) + + if self.use_layer_scale: + layer_output = hidden_states + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * outputs) + + layer_output = layer_output + self.drop_path( + self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(layer_output) + ) + else: + layer_output = hidden_states + self.drop_path(outputs) + layer_output = layer_output + self.drop_path(self.mlp(layer_output)) + + return layer_output + + +class EfficientFormerMeta4DLayers(nn.Module): + def __init__(self, config: EfficientFormerConfig, stage_idx: int): + super().__init__() + num_layers = ( + config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks + ) + drop_paths = [ + config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers) + ] + + self.blocks = nn.ModuleList( + [ + EfficientFormerMeta4D(config, config.hidden_sizes[stage_idx], drop_path=drop_path) + for drop_path in drop_paths + ] + ) + + def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]: + for layer_module in self.blocks: + hidden_states = layer_module(hidden_states) + return hidden_states + + +class EfficientFormerIntermediateStage(nn.Module): + def __init__(self, config: EfficientFormerConfig, index: int): + super().__init__() + self.meta4D_layers = EfficientFormerMeta4DLayers(config, index) + + def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]: + hidden_states = self.meta4D_layers(hidden_states) + return hidden_states + + +class EfficientFormerLastStage(nn.Module): + def __init__(self, config: EfficientFormerConfig): + super().__init__() + self.meta4D_layers = EfficientFormerMeta4DLayers(config, -1) + self.flat = EfficientFormerFlat() + self.meta3D_layers = EfficientFormerMeta3DLayers(config) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]: + hidden_states = self.meta4D_layers(hidden_states) + hidden_states = self.flat(hidden_states) + hidden_states = self.meta3D_layers(hidden_states, output_attentions) + + return hidden_states + + +class EfficientFormerEncoder(nn.Module): + def __init__(self, config: EfficientFormerConfig): + super().__init__() + self.config = config + num_intermediate_stages = len(config.depths) - 1 + downsamples = [ + config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1] + for i in range(num_intermediate_stages) + ] + intermediate_stages = [] + + for i in range(num_intermediate_stages): + intermediate_stages.append(EfficientFormerIntermediateStage(config, i)) + if downsamples[i]: + intermediate_stages.append( + EfficientFormerPatchEmbeddings(config, config.hidden_sizes[i], config.hidden_sizes[i + 1]) + ) + + self.intermediate_stages = nn.ModuleList(intermediate_stages) + self.last_stage = EfficientFormerLastStage(config) + + def forward( + self, + hidden_states: torch.Tensor, + output_hidden_states: bool = False, + output_attentions: bool = False, + return_dict: bool = True, + ) -> BaseModelOutput: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + for layer_module in self.intermediate_stages: + hidden_states = layer_module(hidden_states) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_output = self.last_stage(hidden_states, output_attentions=output_attentions) + + if output_attentions: + all_self_attentions = all_self_attentions + layer_output[1:] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (layer_output[0],) + + if not return_dict: + return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutput( + last_hidden_state=layer_output[0], + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class EfficientFormerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: EfficientFormerConfig + base_model_prefix = "efficientformer" + main_input_name = "pixel_values" + supports_gradient_checkpointing = False + + def _init_weights(self, module: nn.Module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +EFFICIENTFORMER_START_DOCSTRING = r""" + This model is a PyTorch [nn.Module](https://pytorch.org/docs/stable/nn.html#nn.Module) subclass. Use it as a + regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. + + Parameters: + config ([`EfficientFormerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +EFFICIENTFORMER_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See + [`ViTImageProcessor.preprocess`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.", + EFFICIENTFORMER_START_DOCSTRING, +) +class EfficientFormerModel(EfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig): + super().__init__(config) + self.config = config + _no_split_modules = ["EfficientFormerMeta4D"] + + self.patch_embed = EfficientFormerConvStem(config, config.hidden_sizes[0]) + self.encoder = EfficientFormerEncoder(config) + self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.patch_embed(pixel_values) + encoder_outputs = self.encoder( + embedding_output, output_attentions=output_attentions, output_hidden_states=output_hidden_states + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + if not return_dict: + head_outputs = (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """ + EfficientFormer Model transformer with an image classification head on top (a linear layer on top of the final + hidden state of the [CLS] token) e.g. for ImageNet. + """, + EFFICIENTFORMER_START_DOCSTRING, +) +class EfficientFormerForImageClassification(EfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig): + super().__init__(config) + + self.num_labels = config.num_labels + self.efficientformer = EfficientFormerModel(config) + + # Classifier head + self.classifier = ( + nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.efficientformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output.mean(-2)) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@dataclass +class EfficientFormerForImageClassificationWithTeacherOutput(ModelOutput): + """ + Output type of [`EfficientFormerForImageClassificationWithTeacher`]. + + Args: + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Prediction scores as the average of the cls_logits and distillation logits. + cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the + class token). + distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the + distillation token). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + logits: Optional[torch.FloatTensor] = None + cls_logits: Optional[torch.FloatTensor] = None + distillation_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@add_start_docstrings( + """ + EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden + state of the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for + ImageNet. + + + + This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet + supported. + + + """, + EFFICIENTFORMER_START_DOCSTRING, +) +class EfficientFormerForImageClassificationWithTeacher(EfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig): + super().__init__(config) + + self.num_labels = config.num_labels + self.efficientformer = EfficientFormerModel(config) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + # Distillation head + self.distillation_classifier = ( + nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=EfficientFormerForImageClassificationWithTeacherOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, EfficientFormerForImageClassificationWithTeacherOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.efficientformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + cls_logits = self.classifier(sequence_output.mean(-2)) + distillation_logits = self.distillation_classifier(sequence_output.mean(-2)) + + # during inference, return the average of both classifier predictions + logits = (cls_logits + distillation_logits) / 2 + + if not return_dict: + output = (logits, cls_logits, distillation_logits) + outputs[1:] + return output + + return EfficientFormerForImageClassificationWithTeacherOutput( + logits=logits, + cls_logits=cls_logits, + distillation_logits=distillation_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "EfficientFormerForImageClassification", + "EfficientFormerForImageClassificationWithTeacher", + "EfficientFormerModel", + "EfficientFormerPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py new file mode 100644 index 0000000000000000000000000000000000000000..643097e79c3eab47b62f7e7c4781b2b276418176 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py @@ -0,0 +1,1198 @@ +# coding=utf-8 +# Copyright 2023 Snapchat Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TensorFlow EfficientFormer model.""" + +import itertools +from dataclasses import dataclass +from typing import Optional, Union + +import tensorflow as tf + +from ....activations_tf import ACT2FN +from ....modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPooling, + TFImageClassifierOutput, +) +from ....modeling_tf_utils import ( + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ....tf_utils import shape_list, stable_softmax +from ....utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_efficientformer import EfficientFormerConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "EfficientFormerConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300" +_EXPECTED_OUTPUT_SHAPE = [1, 49, 448] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300" +_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_281" + + +class TFEfficientFormerPatchEmbeddings(keras.layers.Layer): + """ + This class performs downsampling between two stages. For the input tensor with the shape [batch_size, num_channels, + height, width] it produces output tensor with the shape [batch_size, num_channels, height/stride, width/stride] + """ + + def __init__( + self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True, **kwargs + ) -> None: + super().__init__(**kwargs) + self.num_channels = num_channels + + self.padding = keras.layers.ZeroPadding2D(padding=config.downsample_pad) + self.projection = keras.layers.Conv2D( + filters=embed_dim, + kernel_size=config.downsample_patch_size, + strides=config.downsample_stride, + padding="valid", + name="projection", + ) + # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization + self.norm = ( + keras.layers.BatchNormalization(axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="norm") + if apply_norm + else tf.identity + ) + self.embed_dim = embed_dim + + def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: + tf.debugging.assert_shapes( + [(pixel_values, (..., None, None, self.num_channels))], + message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.", + ) + embeddings = self.projection(self.padding(pixel_values)) + embeddings = self.norm(embeddings, training=training) + return embeddings + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build([None, None, None, self.num_channels]) + if getattr(self, "norm", None) is not None: + if hasattr(self.norm, "name"): + with tf.name_scope(self.norm.name): + self.norm.build([None, None, None, self.embed_dim]) + + +class TFEfficientFormerSelfAttention(keras.layers.Layer): + def __init__( + self, + dim: int, + key_dim: int, + num_heads: int, + attention_ratio: int, + resolution: int, + config: EfficientFormerConfig, + **kwargs, + ): + super().__init__(**kwargs) + + self.num_heads = num_heads + self.key_dim = key_dim + self.attention_ratio = attention_ratio + self.scale = key_dim**-0.5 + self.total_key_dim = key_dim * num_heads + self.expanded_key_dim = int(attention_ratio * key_dim) + self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads) + hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2 + + self.qkv = keras.layers.Dense( + units=hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="qkv" + ) + self.projection = keras.layers.Dense( + units=dim, kernel_initializer=get_initializer(config.initializer_range), name="projection" + ) + self.resolution = resolution + self.dim = dim + + def build(self, input_shape: tf.TensorShape) -> None: + points = list(itertools.product(range(self.resolution), range(self.resolution))) + num_points = len(points) + attention_offsets = {} + + idxs = [] + + for point_1 in points: + for point_2 in points: + offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1])) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + idxs.append(attention_offsets[offset]) + + self.attention_biases = self.add_weight( + shape=(self.num_heads, len(attention_offsets)), + initializer=keras.initializers.zeros(), + trainable=True, + name="attention_biases", + ) + self.attention_bias_idxs = self.add_weight( + shape=(num_points, num_points), + trainable=False, + dtype=tf.int32, + name="attention_bias_idxs", + ) + + self.attention_bias_idxs.assign(tf.reshape(tf.cast(idxs, dtype=tf.int32), (num_points, num_points))) + + if self.built: + return + self.built = True + if getattr(self, "qkv", None) is not None: + with tf.name_scope(self.qkv.name): + self.qkv.build([None, None, self.dim]) + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build([None, None, self.total_expanded_key_dim]) + + def call( + self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False + ) -> tuple[tf.Tensor]: + batch_size, sequence_length, *_ = shape_list(hidden_states) + qkv = self.qkv(inputs=hidden_states) + + query_layer, key_layer, value_layer = tf.split( + tf.reshape(tensor=qkv, shape=(batch_size, sequence_length, self.num_heads, -1)), + num_or_size_splits=[self.key_dim, self.key_dim, self.expanded_key_dim], + axis=3, + ) + + query_layer = tf.transpose(query_layer, perm=[0, 2, 1, 3]) + key_layer = tf.transpose(key_layer, perm=[0, 2, 1, 3]) + value_layer = tf.transpose(value_layer, perm=[0, 2, 1, 3]) + + attention_probs = tf.matmul(query_layer, tf.transpose(key_layer, perm=[0, 1, 3, 2])) + scale = tf.cast(self.scale, dtype=attention_probs.dtype) + attention_probs = tf.multiply(attention_probs, scale) + + attention_biases = tf.gather(params=self.attention_biases, indices=self.attention_bias_idxs, axis=1) + attention_probs = attention_probs + attention_biases + attention_probs = stable_softmax(logits=attention_probs, axis=-1) + + context_layer = tf.matmul(attention_probs, value_layer) + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + + context_layer = tf.reshape( + tensor=context_layer, shape=(batch_size, sequence_length, self.total_expanded_key_dim) + ) + context_layer = self.projection(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class TFEfficientFormerConvStem(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, out_channels: int, **kwargs): + super().__init__(**kwargs) + + self.padding = keras.layers.ZeroPadding2D(padding=1) + self.convolution1 = keras.layers.Conv2D( + filters=out_channels // 2, kernel_size=3, strides=2, padding="valid", name="convolution1" + ) + # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization + self.batchnorm_before = keras.layers.BatchNormalization( + axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before" + ) + + self.convolution2 = keras.layers.Conv2D( + filters=out_channels, + kernel_size=3, + strides=2, + padding="valid", + name="convolution2", + ) + # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization + self.batchnorm_after = keras.layers.BatchNormalization( + axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after" + ) + + self.activation = keras.layers.Activation(activation=keras.activations.relu, name="activation") + self.out_channels = out_channels + self.config = config + + def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: + features = self.batchnorm_before(self.convolution1(self.padding(pixel_values)), training=training) + features = self.activation(features) + features = self.batchnorm_after(self.convolution2(self.padding(features)), training=training) + features = self.activation(features) + return features + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution1", None) is not None: + with tf.name_scope(self.convolution1.name): + self.convolution1.build([None, None, None, self.config.num_channels]) + if getattr(self, "batchnorm_before", None) is not None: + with tf.name_scope(self.batchnorm_before.name): + self.batchnorm_before.build([None, None, None, self.out_channels // 2]) + if getattr(self, "convolution2", None) is not None: + with tf.name_scope(self.convolution2.name): + self.convolution2.build([None, None, None, self.out_channels // 2]) + if getattr(self, "batchnorm_after", None) is not None: + with tf.name_scope(self.batchnorm_after.name): + self.batchnorm_after.build([None, None, None, self.out_channels]) + if getattr(self, "activation", None) is not None: + with tf.name_scope(self.activation.name): + self.activation.build(None) + + +class TFEfficientFormerPooling(keras.layers.Layer): + def __init__(self, pool_size: int, **kwargs): + super().__init__(**kwargs) + self.pool = keras.layers.AveragePooling2D(pool_size=pool_size, strides=1, padding="same") + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + output = self.pool(hidden_states) + output = output - hidden_states + return output + + +class TFEfficientFormerDenseMlp(keras.layers.Layer): + def __init__( + self, + config: EfficientFormerConfig, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + **kwargs, + ): + super().__init__(**kwargs) + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.linear_in = keras.layers.Dense( + units=hidden_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_in" + ) + self.activation = ACT2FN[config.hidden_act] + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + + self.linear_out = keras.layers.Dense( + units=out_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_out" + ) + self.hidden_features = hidden_features + self.in_features = in_features + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.linear_in(inputs=hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.linear_out(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "linear_in", None) is not None: + with tf.name_scope(self.linear_in.name): + self.linear_in.build([None, None, self.in_features]) + if getattr(self, "linear_out", None) is not None: + with tf.name_scope(self.linear_out.name): + self.linear_out.build([None, None, self.hidden_features]) + + +class TFEfficientFormerConvMlp(keras.layers.Layer): + def __init__( + self, + config: EfficientFormerConfig, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + drop: float = 0.0, + **kwargs, + ): + super().__init__(**kwargs) + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.convolution1 = keras.layers.Conv2D( + filters=hidden_features, + kernel_size=1, + name="convolution1", + padding="valid", + ) + + self.activation = ACT2FN[config.hidden_act] + + self.convolution2 = keras.layers.Conv2D( + filters=out_features, + kernel_size=1, + name="convolution2", + padding="valid", + ) + + self.dropout = keras.layers.Dropout(rate=drop) + + # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization + self.batchnorm_before = keras.layers.BatchNormalization( + axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before" + ) + # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization + self.batchnorm_after = keras.layers.BatchNormalization( + axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after" + ) + self.hidden_features = hidden_features + self.in_features = in_features + self.out_features = out_features + + def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_state = self.convolution1(hidden_state) + hidden_state = self.batchnorm_before(hidden_state, training=training) + hidden_state = self.activation(hidden_state) + hidden_state = self.dropout(hidden_state, training=training) + hidden_state = self.convolution2(hidden_state) + hidden_state = self.batchnorm_after(hidden_state, training=training) + hidden_state = self.dropout(hidden_state, training=training) + return hidden_state + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution1", None) is not None: + with tf.name_scope(self.convolution1.name): + self.convolution1.build([None, None, None, self.in_features]) + if getattr(self, "convolution2", None) is not None: + with tf.name_scope(self.convolution2.name): + self.convolution2.build([None, None, None, self.hidden_features]) + if getattr(self, "batchnorm_before", None) is not None: + with tf.name_scope(self.batchnorm_before.name): + self.batchnorm_before.build([None, None, None, self.hidden_features]) + if getattr(self, "batchnorm_after", None) is not None: + with tf.name_scope(self.batchnorm_after.name): + self.batchnorm_after.build([None, None, None, self.out_features]) + + +# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->EfficientFormer +class TFEfficientFormerDropPath(keras.layers.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + References: + (1) github.com:rwightman/pytorch-image-models + """ + + def __init__(self, drop_path: float, **kwargs): + super().__init__(**kwargs) + self.drop_path = drop_path + + def call(self, x: tf.Tensor, training=None): + if training: + keep_prob = 1 - self.drop_path + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +class TFEfficientFormerFlat(keras.layers.Layer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def call(self, hidden_states: tf.Tensor) -> tuple[tf.Tensor]: + batch_size, _, _, in_channels = shape_list(hidden_states) + hidden_states = tf.reshape(hidden_states, shape=[batch_size, -1, in_channels]) + return hidden_states + + +class TFEfficientFormerMeta3D(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs): + super().__init__(**kwargs) + + self.token_mixer = TFEfficientFormerSelfAttention( + dim=config.dim, + key_dim=config.key_dim, + num_heads=config.num_attention_heads, + attention_ratio=config.attention_ratio, + resolution=config.resolution, + name="token_mixer", + config=config, + ) + self.dim = dim + self.config = config + + self.layernorm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm1") + self.layernorm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm2") + mlp_hidden_dim = int(dim * config.mlp_expansion_ratio) + self.mlp = TFEfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim, name="mlp") + + # Using `layers.Activation` instead of `tf.identity` to better control `training' behavior. + self.drop_path = ( + TFEfficientFormerDropPath(drop_path) + if drop_path > 0.0 + else keras.layers.Activation("linear", name="drop_path") + ) + self.config = config + + def build(self, input_shape=None): + self.layer_scale_1 = None + self.layer_scale_2 = None + + if self.config.use_layer_scale: + self.layer_scale_1 = self.add_weight( + shape=(self.dim,), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), + trainable=True, + name="layer_scale_1", + ) + self.layer_scale_2 = self.add_weight( + shape=(self.dim,), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), + trainable=True, + name="layer_scale_2", + ) + + if self.built: + return + self.built = True + if getattr(self, "token_mixer", None) is not None: + with tf.name_scope(self.token_mixer.name): + self.token_mixer.build(None) + if getattr(self, "layernorm1", None) is not None: + with tf.name_scope(self.layernorm1.name): + self.layernorm1.build([None, None, self.dim]) + if getattr(self, "layernorm2", None) is not None: + with tf.name_scope(self.layernorm2.name): + self.layernorm2.build([None, None, self.dim]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "drop_path", None) is not None: + with tf.name_scope(self.drop_path.name): + self.drop_path.build(None) + + def call( + self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False + ) -> tuple[tf.Tensor]: + self_attention_outputs = self.token_mixer( + hidden_states=self.layernorm1(hidden_states, training=training), + output_attentions=output_attentions, + training=training, + ) + + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.config.use_layer_scale: + layer_output = hidden_states + self.drop_path( + tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * attention_output, + training=training, + ) + layer_output = layer_output + self.drop_path( + tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0) + * self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training), + training=training, + ) + else: + layer_output = hidden_states + self.drop_path(attention_output, training=training) + layer_output = layer_output + self.drop_path( + self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training), + training=training, + ) + + outputs = (layer_output,) + outputs + + return outputs + + +class TFEfficientFormerMeta3DLayers(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, **kwargs): + super().__init__(**kwargs) + drop_paths = [ + config.drop_path_rate * (block_idx + sum(config.depths[:-1])) + for block_idx in range(config.num_meta3d_blocks) + ] + self.blocks = [ + TFEfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path, name=f"blocks.{i}") + for i, drop_path in enumerate(drop_paths) + ] + + def call( + self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False + ) -> tuple[tf.Tensor]: + all_attention_outputs = () if output_attentions else None + + for i, layer_module in enumerate(self.blocks): + if isinstance(hidden_states, tuple): + hidden_states = hidden_states[0] + + hidden_states = layer_module( + hidden_states=hidden_states, output_attentions=output_attentions, training=training + ) + if output_attentions: + all_attention_outputs = all_attention_outputs + (hidden_states[1],) + + if output_attentions: + outputs = (hidden_states[0],) + all_attention_outputs + return outputs + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "blocks", None) is not None: + for layer in self.blocks: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFEfficientFormerMeta4D(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs): + super().__init__(**kwargs) + pool_size = config.pool_size if config.pool_size is not None else 3 + self.token_mixer = TFEfficientFormerPooling(pool_size=pool_size, name="token_mixer") + self.dim = dim + mlp_hidden_dim = int(dim * config.mlp_expansion_ratio) + self.mlp = TFEfficientFormerConvMlp( + config=config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob, name="mlp" + ) + + self.drop_path = ( + TFEfficientFormerDropPath(drop_path, name="drop_path") + if drop_path > 0.0 + else keras.layers.Activation("linear", name="drop_path") + ) + self.config = config + + def build(self, input_shape=None): + self.layer_scale_1 = None + self.layer_scale_2 = None + + if self.config.use_layer_scale: + self.layer_scale_1 = self.add_weight( + shape=(self.dim), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), + trainable=True, + name="layer_scale_1", + ) + self.layer_scale_2 = self.add_weight( + shape=(self.dim), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), + trainable=True, + name="layer_scale_2", + ) + + if self.built: + return + self.built = True + if getattr(self, "token_mixer", None) is not None: + with tf.name_scope(self.token_mixer.name): + self.token_mixer.build(None) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "drop_path", None) is not None: + with tf.name_scope(self.drop_path.name): + self.drop_path.build(None) + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tuple[tf.Tensor]: + outputs = self.token_mixer(hidden_states) + + if self.config.use_layer_scale: + layer_output = hidden_states + self.drop_path( + tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * outputs, + training=training, + ) + + layer_output = layer_output + self.drop_path( + tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0) + * self.mlp(hidden_state=layer_output, training=training), + training=training, + ) + + else: + layer_output = hidden_states + self.drop_path(outputs, training=training) + layer_output = layer_output + self.drop_path( + self.mlp(hidden_state=layer_output, training=training), training=training + ) + + return layer_output + + +class TFEfficientFormerMeta4DLayers(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, stage_idx: int, **kwargs): + super().__init__(**kwargs) + num_layers = ( + config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks + ) + drop_paths = [ + config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers) + ] + + self.blocks = [ + TFEfficientFormerMeta4D( + config=config, dim=config.hidden_sizes[stage_idx], drop_path=drop_paths[i], name=f"blocks.{i}" + ) + for i in range(len(drop_paths)) + ] + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tuple[tf.Tensor]: + for layer_module in self.blocks: + hidden_states = layer_module(hidden_states=hidden_states, training=training) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "blocks", None) is not None: + for layer in self.blocks: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFEfficientFormerIntermediateStage(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, index: int, **kwargs): + super().__init__(**kwargs) + self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=index, name="meta4D_layers") + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tuple[tf.Tensor]: + hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "meta4D_layers", None) is not None: + with tf.name_scope(self.meta4D_layers.name): + self.meta4D_layers.build(None) + + +class TFEfficientFormerLastStage(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, **kwargs): + super().__init__(**kwargs) + self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=-1, name="meta4D_layers") + self.flat = TFEfficientFormerFlat(name="flat") + self.meta3D_layers = TFEfficientFormerMeta3DLayers(config, name="meta3D_layers") + + def call( + self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False + ) -> tuple[tf.Tensor]: + hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training) + hidden_states = self.flat(hidden_states=hidden_states) + hidden_states = self.meta3D_layers( + hidden_states=hidden_states, output_attentions=output_attentions, training=training + ) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "meta4D_layers", None) is not None: + with tf.name_scope(self.meta4D_layers.name): + self.meta4D_layers.build(None) + if getattr(self, "flat", None) is not None: + with tf.name_scope(self.flat.name): + self.flat.build(None) + if getattr(self, "meta3D_layers", None) is not None: + with tf.name_scope(self.meta3D_layers.name): + self.meta3D_layers.build(None) + + +class TFEfficientFormerEncoder(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + num_intermediate_stages = len(config.depths) - 1 + downsamples = [ + config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1] + for i in range(num_intermediate_stages) + ] + + intermediate_stages = [] + layer_count = -1 + for i in range(num_intermediate_stages): + layer_count += 1 + intermediate_stages.append( + TFEfficientFormerIntermediateStage(config, i, name=f"intermediate_stages.{layer_count}") + ) + if downsamples[i]: + layer_count += 1 + intermediate_stages.append( + TFEfficientFormerPatchEmbeddings( + config, + config.hidden_sizes[i], + config.hidden_sizes[i + 1], + name=f"intermediate_stages.{layer_count}", + ) + ) + self.intermediate_stages = intermediate_stages + self.last_stage = TFEfficientFormerLastStage(config, name="last_stage") + + def call( + self, + hidden_states: tf.Tensor, + output_hidden_states: bool, + output_attentions: bool, + return_dict: bool, + training: bool = False, + ) -> TFBaseModelOutput: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + for layer_module in self.intermediate_stages: + hidden_states = layer_module(hidden_states, training=training) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_output = self.last_stage(hidden_states, output_attentions=output_attentions, training=training) + + if output_attentions: + all_self_attentions = all_self_attentions + layer_output[1:] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (layer_output[0],) + + if not return_dict: + return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=layer_output[0], + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "last_stage", None) is not None: + with tf.name_scope(self.last_stage.name): + self.last_stage.build(None) + for layer in self.intermediate_stages: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFEfficientFormerMainLayer(keras.layers.Layer): + config_class = EfficientFormerConfig + + def __init__(self, config: EfficientFormerConfig, **kwargs) -> None: + super().__init__(**kwargs) + self.config = config + + self.patch_embed = TFEfficientFormerConvStem(config, config.hidden_sizes[0], name="patch_embed") + self.encoder = TFEfficientFormerEncoder(config, name="encoder") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + + @unpack_inputs + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + output_attentions: Optional[tf.Tensor] = None, + output_hidden_states: Optional[tf.Tensor] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[TFBaseModelOutput, tuple[tf.Tensor, ...]]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # When running on CPU, keras.layers.Conv2D and keras.layers.AveragePool2D do not + # support channels first NCHW format. A number of blocks contain both. + # So change the input format from (batch_size, num_channels, height, width) to + # (batch_size, height, width, num_channels) here. + # shape = (batch_size, in_height, in_width, in_channels=num_channels) + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + embedding_output = self.patch_embed(pixel_values, training=training) + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output, training=training) + + # Change the hidden states from (batch_size, height, width, num_channels) to + # (batch_size, num_channels, height, width). + # The hidden states are in (batch_size, height, width, num_channels) + # shape after all stages except the MB3D blocks. + if output_hidden_states: + hidden_states = tuple(tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1][:-1]) + ( + encoder_outputs[1][-1], + ) + + if not return_dict: + head_outputs = (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return TFBaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "patch_embed", None) is not None: + with tf.name_scope(self.patch_embed.name): + self.patch_embed.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.hidden_sizes[-1]]) + + +class TFEfficientFormerPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = EfficientFormerConfig + base_model_prefix = "efficientformer" + main_input_name = "pixel_values" + + +EFFICIENTFORMER_START_DOCSTRING = r""" + This model is a TensorFlow + [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular + TensorFlow Module and refer to the TensorFlow documentation for all matter related to general usage and behavior. + + + Parameters: + config ([`EfficientFormerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +EFFICIENTFORMER_INPUTS_DOCSTRING = r""" + Args: + pixel_values ((`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`EfficientFormerImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.", + EFFICIENTFORMER_START_DOCSTRING, +) +class TFEfficientFormerModel(TFEfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig, **kwargs) -> None: + super().__init__(config, **kwargs) + + self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer") + + @unpack_inputs + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[tuple, TFBaseModelOutput]: + outputs = self.efficientformer( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "efficientformer", None) is not None: + with tf.name_scope(self.efficientformer.name): + self.efficientformer.build(None) + + +@add_start_docstrings( + """ + EfficientFormer Model transformer with an image classification head on top of pooled last hidden state, e.g. for + ImageNet. + """, + EFFICIENTFORMER_START_DOCSTRING, +) +class TFEfficientFormerForImageClassification(TFEfficientFormerPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: EfficientFormerConfig): + super().__init__(config) + + self.num_labels = config.num_labels + self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer") + + # Classifier head + self.classifier = ( + keras.layers.Dense(config.num_labels, name="classifier") + if config.num_labels > 0 + else keras.layers.Activation("linear", name="classifier") + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=TFImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + labels: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[tf.Tensor, TFImageClassifierOutput]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.efficientformer( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + + logits = self.classifier(tf.reduce_mean(sequence_output, axis=-2)) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFImageClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "efficientformer", None) is not None: + with tf.name_scope(self.efficientformer.name): + self.efficientformer.build(None) + if getattr(self, "classifier", None) is not None: + if hasattr(self.classifier, "name"): + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_sizes[-1]]) + + +@dataclass +class TFEfficientFormerForImageClassificationWithTeacherOutput(ModelOutput): + """ + Args: + Output type of [`EfficientFormerForImageClassificationWithTeacher`]. + logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Prediction scores as the average of the cls_logits and distillation logits. + cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the + class token). + distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the + distillation token). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus + the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + logits: Optional[tf.Tensor] = None + cls_logits: Optional[tf.Tensor] = None + distillation_logits: Optional[tf.Tensor] = None + hidden_states: Optional[tuple[tf.Tensor]] = None + attentions: Optional[tuple[tf.Tensor]] = None + + +@add_start_docstrings( + """ + EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden + state and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet. + + .. warning:: + This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet + supported. + """, + EFFICIENTFORMER_START_DOCSTRING, +) +class TFEfficientFormerForImageClassificationWithTeacher(TFEfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer") + + # Classifier heads + self.classifier = ( + keras.layers.Dense(config.num_labels, name="classifier") + if config.num_labels > 0 + else keras.layers.Activation("linear", name="classifier") + ) + self.distillation_classifier = ( + keras.layers.Dense(config.num_labels, name="distillation_classifier") + if config.num_labels > 0 + else keras.layers.Activation("linear", name="distillation_classifier") + ) + + @unpack_inputs + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=TFEfficientFormerForImageClassificationWithTeacherOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[tuple, TFEfficientFormerForImageClassificationWithTeacherOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if training: + raise Exception( + "This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet supported." + ) + + outputs = self.efficientformer( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + + cls_logits = self.classifier(tf.reduce_mean(sequence_output, axis=-2)) + distillation_logits = self.distillation_classifier(tf.reduce_mean(sequence_output, axis=-2)) + logits = (cls_logits + distillation_logits) / 2 + + if not return_dict: + output = (logits, cls_logits, distillation_logits) + outputs[1:] + return output + + return TFEfficientFormerForImageClassificationWithTeacherOutput( + logits=logits, + cls_logits=cls_logits, + distillation_logits=distillation_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "efficientformer", None) is not None: + with tf.name_scope(self.efficientformer.name): + self.efficientformer.build(None) + if getattr(self, "classifier", None) is not None: + if hasattr(self.classifier, "name"): + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_sizes[-1]]) + if getattr(self, "distillation_classifier", None) is not None: + if hasattr(self.distillation_classifier, "name"): + with tf.name_scope(self.distillation_classifier.name): + self.distillation_classifier.build([None, None, self.config.hidden_sizes[-1]]) + + +__all__ = [ + "TFEfficientFormerForImageClassification", + "TFEfficientFormerForImageClassificationWithTeacher", + "TFEfficientFormerModel", + "TFEfficientFormerPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2beb8f463ff10af33ea980599ea4d5fc05888acb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2023 The HuggingFace and Baidu Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_ernie_m import * + from .modeling_ernie_m import * + from .tokenization_ernie_m import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/configuration_ernie_m.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/configuration_ernie_m.py new file mode 100644 index 0000000000000000000000000000000000000000..839b17b11bf7248136668d8c325f03e10a64cd96 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/configuration_ernie_m.py @@ -0,0 +1,112 @@ +# coding=utf-8 +# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ErnieM model configuration""" +# Adapted from original paddlenlp repository.(https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_m/configuration.py) + +from __future__ import annotations + +from ....configuration_utils import PretrainedConfig + + +class ErnieMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ErnieMModel`]. It is used to instantiate a + Ernie-M model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the `Ernie-M` + [susnato/ernie-m-base_pytorch](https://huggingface.co/susnato/ernie-m-base_pytorch) architecture. + + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 250002): + Vocabulary size of `inputs_ids` in [`ErnieMModel`]. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling + [`ErnieMModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the embedding layer, encoder layers and pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to feed-forward layers are + firstly projected from hidden_size to intermediate_size, and then projected back to hidden_size. Typically + intermediate_size is larger than hidden_size. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the feed-forward layer. `"gelu"`, `"relu"` and any other torch + supported activation functions are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings and encoder. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability used in `MultiHeadAttention` in all encoder layers to drop some attention target. + max_position_embeddings (`int`, *optional*, defaults to 514): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length + of an input sequence. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the normal initializer for initializing all weight matrices. The index of padding + token in the token vocabulary. + pad_token_id (`int`, *optional*, defaults to 1): + Padding token id. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + act_dropout (`float`, *optional*, defaults to 0.0): + This dropout probability is used in `ErnieMEncoderLayer` after activation. + + A normal_initializer initializes weight matrices as normal distributions. See + `ErnieMPretrainedModel._init_weights()` for how weights are initialized in `ErnieMModel`. + """ + + model_type = "ernie_m" + attribute_map: dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + + def __init__( + self, + vocab_size: int = 250002, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 514, + initializer_range: float = 0.02, + pad_token_id: int = 1, + layer_norm_eps: float = 1e-05, + classifier_dropout=None, + act_dropout=0.0, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.classifier_dropout = classifier_dropout + self.act_dropout = act_dropout + + +__all__ = ["ErnieMConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/modeling_ernie_m.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/modeling_ernie_m.py new file mode 100644 index 0000000000000000000000000000000000000000..f0e97c132d0989079aa647cac4e64bbae451a116 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/modeling_ernie_m.py @@ -0,0 +1,1061 @@ +# coding=utf-8 +# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ErnieM model.""" + +import math +from typing import Optional, Union + +import torch +from torch import nn, tensor +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ....activations import ACT2FN +from ....cache_utils import Cache +from ....modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ....modeling_utils import PreTrainedModel +from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from ....utils.deprecation import deprecate_kwarg +from .configuration_ernie_m import ErnieMConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "susnato/ernie-m-base_pytorch" +_CONFIG_FOR_DOC = "ErnieMConfig" +_TOKENIZER_FOR_DOC = "ErnieMTokenizer" + + +# Adapted from paddlenlp.transformers.ernie_m.modeling.ErnieEmbeddings +class ErnieMEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=config.pad_token_id + ) + self.layer_norm = nn.LayerNorm(normalized_shape=config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(p=config.hidden_dropout_prob) + self.padding_idx = config.pad_token_id + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + if position_ids is None: + input_shape = inputs_embeds.size()[:-1] + ones = torch.ones(input_shape, dtype=torch.int64, device=inputs_embeds.device) + seq_length = torch.cumsum(ones, dim=1) + position_ids = seq_length - ones + + if past_key_values_length > 0: + position_ids = position_ids + past_key_values_length + # to mimic paddlenlp implementation + position_ids += 2 + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeds + position_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + + return embeddings + + +class ErnieMSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.q_proj = nn.Linear(config.hidden_size, self.all_head_size) + self.k_proj = nn.Linear(config.hidden_size, self.all_head_size) + self.v_proj = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + mixed_query_layer = self.q_proj(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_values is not None: + # reuse k,v, cross_attentions + key_layer = past_key_values[0] + value_layer = past_key_values[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.k_proj(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.v_proj(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_values is not None: + key_layer = self.transpose_for_scores(self.k_proj(hidden_states)) + value_layer = self.transpose_for_scores(self.v_proj(hidden_states)) + key_layer = torch.cat([past_key_values[0], key_layer], dim=2) + value_layer = torch.cat([past_key_values[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.k_proj(hidden_states)) + value_layer = self.transpose_for_scores(self.v_proj(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_values is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_values` is always `None` + past_key_values = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in ErnieMModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_values,) + return outputs + + +class ErnieMAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self_attn = ErnieMSelfAttention(config, position_embedding_type=position_embedding_type) + self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self_attn.num_attention_heads, self.self_attn.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self_attn.q_proj = prune_linear_layer(self.self_attn.q_proj, index) + self.self_attn.k_proj = prune_linear_layer(self.self_attn.k_proj, index) + self.self_attn.v_proj = prune_linear_layer(self.self_attn.v_proj, index) + self.out_proj = prune_linear_layer(self.out_proj, index, dim=1) + + # Update hyper params and store pruned heads + self.self_attn.num_attention_heads = self.self_attn.num_attention_heads - len(heads) + self.self_attn.all_head_size = self.self_attn.attention_head_size * self.self_attn.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + self_outputs = self.self_attn( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values, + output_attentions, + ) + attention_output = self.out_proj(self_outputs[0]) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class ErnieMEncoderLayer(nn.Module): + def __init__(self, config): + super().__init__() + # to mimic paddlenlp implementation + dropout = 0.1 if config.hidden_dropout_prob is None else config.hidden_dropout_prob + act_dropout = config.hidden_dropout_prob if config.act_dropout is None else config.act_dropout + + self.self_attn = ErnieMAttention(config) + self.linear1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.dropout = nn.Dropout(act_dropout) + self.linear2 = nn.Linear(config.intermediate_size, config.hidden_size) + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = True, + ): + residual = hidden_states + if output_attentions: + hidden_states, attention_opt_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + ) + + else: + hidden_states = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + ) + hidden_states = residual + self.dropout1(hidden_states) + hidden_states = self.norm1(hidden_states) + residual = hidden_states + + hidden_states = self.linear1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.linear2(hidden_states) + hidden_states = residual + self.dropout2(hidden_states) + hidden_states = self.norm2(hidden_states) + + if output_attentions: + return hidden_states, attention_opt_weights + else: + return hidden_states + + +class ErnieMEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ErnieMEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + input_embeds: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + hidden_states = () if output_hidden_states else None + attentions = () if output_attentions else None + + output = input_embeds + if output_hidden_states: + hidden_states = hidden_states + (output,) + for i, layer in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + + output, opt_attn_weights = layer( + hidden_states=output, + attention_mask=attention_mask, + head_mask=layer_head_mask, + past_key_values=past_key_values[i] if past_key_values is not None else None, + ) + + if output_hidden_states: + hidden_states = hidden_states + (output,) + if output_attentions: + attentions = attentions + (opt_attn_weights,) + + last_hidden_state = output + if not return_dict: + return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=last_hidden_state, hidden_states=hidden_states, attentions=attentions + ) + + +class ErnieMPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class ErnieMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: ErnieMConfig + base_model_prefix = "ernie_m" + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +ERNIE_M_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`ErnieMConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ERNIE_M_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`ErnieMTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare ErnieM Model transformer outputting raw hidden-states without any specific head on top.", + ERNIE_M_START_DOCSTRING, +) +class ErnieMModel(ErnieMPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.initializer_range = config.initializer_range + self.embeddings = ErnieMEmbeddings(config) + self.encoder = ErnieMEncoder(config) + self.pooler = ErnieMPooler(config) if add_pooling_layer else None + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layers[layer].self_attn.prune_heads(heads) + + @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[tensor] = None, + position_ids: Optional[tensor] = None, + attention_mask: Optional[tensor] = None, + head_mask: Optional[tensor] = None, + inputs_embeds: Optional[tensor] = None, + past_key_values: Optional[tuple[tuple[tensor]]] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], BaseModelOutputWithPoolingAndCrossAttentions]: + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") + + # init the default bool value + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values.get_seq_length() + + # Adapted from paddlenlp.transformers.ernie_m.ErnieMModel + if attention_mask is None: + attention_mask = (input_ids == self.config.pad_token_id).to(torch.float32) + attention_mask *= torch.finfo(attention_mask.dtype).min + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + past_mask = torch.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = torch.concat([past_mask, attention_mask], dim=-1) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = attention_mask.to(torch.float32) + attention_mask = 1.0 - attention_mask + attention_mask *= torch.finfo(attention_mask.dtype).min + + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(1) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + sequence_output = encoder_outputs[0] + pooler_output = self.pooler(sequence_output) if self.pooler is not None else None + return (sequence_output, pooler_output) + encoder_outputs[1:] + + sequence_output = encoder_outputs["last_hidden_state"] + pooler_output = self.pooler(sequence_output) if self.pooler is not None else None + hidden_states = None if not output_hidden_states else encoder_outputs["hidden_states"] + attentions = None if not output_attentions else encoder_outputs["attentions"] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooler_output, + hidden_states=hidden_states, + attentions=attentions, + ) + + +@add_start_docstrings( + """ErnieM Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks.""", + ERNIE_M_START_DOCSTRING, +) +class ErnieMForSequenceClassification(ErnieMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.ernie_m = ErnieMModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = True, + labels: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.FloatTensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ernie_m( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ErnieM Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""", + ERNIE_M_START_DOCSTRING, +) +class ErnieMForMultipleChoice(ErnieMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.ernie_m = ErnieMModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.FloatTensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.ernie_m( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ErnieM Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""", + ERNIE_M_START_DOCSTRING, +) +class ErnieMForTokenClassification(ErnieMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.ernie_m = ErnieMModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = True, + labels: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.FloatTensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ernie_m( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ErnieM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""", + ERNIE_M_START_DOCSTRING, +) +class ErnieMForQuestionAnswering(ErnieMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.ernie_m = ErnieMModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.FloatTensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ernie_m( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ErnieMForInformationExtraction is a Ernie-M Model with two linear layer on top of the hidden-states output to + compute `start_prob` and `end_prob`, designed for Universal Information Extraction.""", + ERNIE_M_START_DOCSTRING, +) +class ErnieMForInformationExtraction(ErnieMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.ernie_m = ErnieMModel(config) + self.linear_start = nn.Linear(config.hidden_size, 1) + self.linear_end = nn.Linear(config.hidden_size, 1) + self.sigmoid = nn.Sigmoid() + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.FloatTensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for position (index) for computing the start_positions loss. Position outside of the sequence are + not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) for computing the end_positions loss. Position outside of the sequence are not + taken into account for computing the loss. + """ + + result = self.ernie_m( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if return_dict: + sequence_output = result.last_hidden_state + elif not return_dict: + sequence_output = result[0] + + start_logits = self.linear_start(sequence_output) + start_logits = start_logits.squeeze(-1) + end_logits = self.linear_end(sequence_output) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = BCEWithLogitsLoss() + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + return tuple( + i + for i in [total_loss, start_logits, end_logits, result.hidden_states, result.attentions] + if i is not None + ) + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=result.hidden_states, + attentions=result.attentions, + ) + + +__all__ = [ + "ErnieMForMultipleChoice", + "ErnieMForQuestionAnswering", + "ErnieMForSequenceClassification", + "ErnieMForTokenClassification", + "ErnieMModel", + "ErnieMPreTrainedModel", + "ErnieMForInformationExtraction", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py new file mode 100644 index 0000000000000000000000000000000000000000..4c33abd043e6d170485e6d01298788d461275aa6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py @@ -0,0 +1,409 @@ +# coding=utf-8 +# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Ernie-M.""" + +import os +import unicodedata +from typing import Any, Optional + +import sentencepiece as spm + +from ....tokenization_utils import PreTrainedTokenizer +from ....utils import logging +from ....utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "sentencepiece_model_ckpt": "sentencepiece.bpe.model"} + +RESOURCE_FILES_NAMES = { + "sentencepiece_model_file": "sentencepiece.bpe.model", + "vocab_file": "vocab.txt", +} + + +# Adapted from paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer +@requires(backends=("sentencepiece",)) +class ErnieMTokenizer(PreTrainedTokenizer): + r""" + Constructs a Ernie-M tokenizer. It uses the `sentencepiece` tools to cut the words to sub-words. + + Args: + sentencepiece_model_file (`str`): + The file path of sentencepiece model. + vocab_file (`str`, *optional*): + The file path of the vocabulary. + do_lower_case (`str`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + A special token representing the `unknown (out-of-vocabulary)` token. An unknown token is set to be + `unk_token` inorder to be converted to an ID. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + A special token separating two different sentences in the same input. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + A special token used to make arrays of tokens the same size for batching purposes. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + A special token used for sequence classification. It is the last token of the sequence when built with + special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + A special token representing a masked token. This is the token used in the masked language modeling task + which the model tries to predict the original unmasked ones. + """ + + # Ernie-M model doesn't have token_type embedding. + model_input_names: list[str] = ["input_ids"] + + vocab_files_names = VOCAB_FILES_NAMES + resource_files_names = RESOURCE_FILES_NAMES + + def __init__( + self, + sentencepiece_model_ckpt, + vocab_file=None, + do_lower_case=False, + encoding="utf8", + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it and + # is included in the raw text, there should be a match in a non-normalized sentence. + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.do_lower_case = do_lower_case + self.sentencepiece_model_ckpt = sentencepiece_model_ckpt + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(sentencepiece_model_ckpt) + + # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning + if vocab_file is not None: + self.vocab = self.load_vocab(filepath=vocab_file) + else: + self.vocab = {self.sp_model.id_to_piece(id): id for id in range(self.sp_model.get_piece_size())} + self.reverse_vocab = {v: k for k, v in self.vocab.items()} + + super().__init__( + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + vocab_file=vocab_file, + encoding=encoding, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + def get_offset_mapping(self, text): + if text is None: + return None + + split_tokens = self.tokenize(text) + normalized_text, char_mapping = "", [] + + for i, ch in enumerate(text): + if ch in self.SP_CHAR_MAPPING: + ch = self.SP_CHAR_MAPPING.get(ch) + else: + ch = unicodedata.normalize("NFKC", ch) + if self.is_whitespace(ch): + continue + normalized_text += ch + char_mapping.extend([i] * len(ch)) + + text, token_mapping, offset = normalized_text, [], 0 + + if self.do_lower_case: + text = text.lower() + + for token in split_tokens: + if token[:1] == "▁": + token = token[1:] + start = text[offset:].index(token) + offset + end = start + len(token) + + token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1)) + offset = end + return token_mapping + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.sentencepiece_model_ckpt) + + def clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + return "".join(self.SP_CHAR_MAPPING.get(c, c) for c in text) + + def _tokenize(self, text, enable_sampling=False, nbest_size=64, alpha=0.1): + """Tokenize a string.""" + + if self.sp_model_kwargs.get("enable_sampling") is True: + enable_sampling = True + if self.sp_model_kwargs.get("alpha") is not None: + alpha = self.sp_model_kwargs.get("alpha") + if self.sp_model_kwargs.get("nbest_size") is not None: + nbest_size = self.sp_model_kwargs.get("nbest_size") + + if not enable_sampling: + pieces = self.sp_model.EncodeAsPieces(text) + else: + pieces = self.sp_model.SampleEncodeAsPieces(text, nbest_size, alpha) + new_pieces = [] + for pi, piece in enumerate(pieces): + if piece == SPIECE_UNDERLINE: + if not pieces[pi + 1].startswith(SPIECE_UNDERLINE) and pi != 0: + new_pieces.append(SPIECE_UNDERLINE) + continue + else: + continue + lst_i = 0 + for i, chunk in enumerate(piece): + if chunk == SPIECE_UNDERLINE: + continue + if self.is_ch_char(chunk) or self.is_punct(chunk): + if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE: + new_pieces.append(piece[lst_i:i]) + new_pieces.append(chunk) + lst_i = i + 1 + elif chunk.isdigit() and i > 0 and not piece[i - 1].isdigit(): + if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE: + new_pieces.append(piece[lst_i:i]) + lst_i = i + elif not chunk.isdigit() and i > 0 and piece[i - 1].isdigit(): + if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE: + new_pieces.append(piece[lst_i:i]) + lst_i = i + if len(piece) > lst_i: + new_pieces.append(piece[lst_i:]) + return new_pieces + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def convert_ids_to_string(self, ids): + """ + Converts a sequence of tokens (strings for sub-words) in a single string. + """ + tokens = self.convert_ids_to_tokens(ids) + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning + def _convert_token_to_id(self, token): + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.reverse_vocab.get(index, self.unk_token) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + r""" + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An ErnieM sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] [SEP] B [SEP]` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `list[int]`: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + r""" + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. An Ernie-M + offset_mapping has the following format: + + - single sequence: `(0,0) X (0,0)` + - pair of sequences: `(0,0) A (0,0) (0,0) B (0,0)` + + Args: + offset_mapping_ids_0 (`list[tuple]`): + List of char offsets to which the special tokens will be added. + offset_mapping_ids_1 (`list[tuple]`, *optional*): + Optional second list of wordpiece offsets for offset mapping pairs. + Returns: + `list[tuple]`: List of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + r""" + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `encode` method. + + Args: + token_ids_0 (`list[int]`): + List of ids of the first sequence. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`str`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + Returns: + `list[int]`: + The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0] + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create the token type IDs corresponding to the sequences passed. [What are token type + IDs?](../glossary#token-type-ids) Should be overridden in a subclass if the model has a special way of + building: those. + + Args: + token_ids_0 (`list[int]`): + The first tokenized sequence. + token_ids_1 (`list[int]`, *optional*): + The second tokenized sequence. + Returns: + `list[int]`: The token type ids. + """ + # called when `add_special_tokens` is True, so align with `build_inputs_with_special_tokens` method + if token_ids_1 is None: + # [CLS] X [SEP] + return (len(token_ids_0) + 2) * [0] + + # [CLS] A [SEP] [SEP] B [SEP] + return [0] * (len(token_ids_0) + 1) + [1] * (len(token_ids_1) + 3) + + def is_ch_char(self, char): + """ + is_ch_char + """ + if "\u4e00" <= char <= "\u9fff": + return True + return False + + def is_alpha(self, char): + """ + is_alpha + """ + if ("a" <= char <= "z") or ("A" <= char <= "Z"): + return True + return False + + def is_punct(self, char): + """ + is_punct + """ + if char in ",;:.?!~,;:。?!《》【】": + return True + return False + + def is_whitespace(self, char): + """ + is whitespace + """ + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + if len(char) == 1: + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + def load_vocab(self, filepath): + token_to_idx = {} + with open(filepath, "r", encoding="utf-8") as f: + for index, line in enumerate(f): + token = line.rstrip("\n") + token_to_idx[token] = int(index) + + return token_to_idx + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + + tokenizer_model_file = os.path.join(save_directory, "sentencepiece.bpe.model") + with open(tokenizer_model_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (vocab_file,) + + +__all__ = ["ErnieMTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3c23b58f35012f1ddc00e2275c484810287de6f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_gptsan_japanese import * + from .modeling_gptsan_japanese import * + from .tokenization_gptsan_japanese import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..a7d1e23e080eddc5cce28e72c551d123bec09de6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py @@ -0,0 +1,157 @@ +# coding=utf-8 +# Copyright 2023, HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GPTSAN-japanese model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class GPTSanJapaneseConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GPTSanJapaneseModel`]. It is used to instantiate + a GPTSANJapanese model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the GPTSANJapanese + [Tanrei/GPTSAN-japanese](https://huggingface.co/Tanrei/GPTSAN-japanese) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Arguments: + vocab_size (`int`, *optional*, defaults to 36000): + Vocabulary size of the GPTSANJapanese model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`GPTSanJapaneseModel`]. + max_position_embeddings (`int`, *optional*, defaults to 1280): + The maximum sequence length that this model might ever be used with. Defaults set this to 1280. + d_model (`int`, *optional*, defaults to 1024): + Size of the encoder layers and the pooler layer. + d_ff (`int`, *optional*, defaults to 8192): + Size of the intermediate feed forward layer in each `SwitchTransformersBlock`. + d_ext (`int`, *optional*, defaults to 4096): + Size of the intermediate feed forward layer in each Extra-layers. + d_spout (`int`, *optional*, defaults to 128): + Size of the `spout` vector. + num_switch_layers (`int`, *optional*, defaults to 10): + Number of layers in the Switch Transformer layer. + num_ext_layers (`int`, *optional*, defaults to 0): + Number of layers in the Extra-layers. + num_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + num_experts (`int`, *optional*, defaults to 16): + Number of experts for each SwitchTransformer layer. + expert_capacity (`int`, *optional*, defaults to 128): + Number of tokens that can be stored in each expert. If set to 1, the model will behave like a regular + Transformer. + dropout_rate (`float`, *optional*, defaults to 0.0): + The ratio for all dropout layers. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + router_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the router. + router_jitter_noise (`float`, *optional*, defaults to 0.0): + Amount of noise to add to the router. Set it to 0.0 during prediction or set small value (usually 1e-2) + during training. + router_dtype (`str`, *optional*, default to `"float32"`): + The `dtype` used for the routers. It is preferable to keep the `dtype` to `"float32"` as specified in the + *selective precision* discussion in [the paper](https://huggingface.co/papers/2101.03961). + router_ignore_padding_tokens (`bool`, *optional*, defaults to `False`): + Whether to ignore padding tokens when routing. + output_hidden_states (`bool`, *optional*, default to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. + initializer_factor (`float`, *optional*, defaults to 0.002): + A factor for initializing all weight matrices. + output_router_logits (`bool`, *optional*, default to `False`): + Whether or not to return the router logits of all experts. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models) + """ + + model_type = "gptsan-japanese" + keys_to_ignore_at_inference = [ + "past_key_values", + ] + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + } + + def __init__( + self, + vocab_size=36000, + max_position_embeddings=1280, + d_model=1024, + d_ff=8192, + d_ext=4096, + d_spout=128, + num_switch_layers=10, + num_ext_layers=0, + num_heads=16, + num_experts=16, + expert_capacity=128, + dropout_rate=0.0, + layer_norm_epsilon=1e-5, + router_bias=False, + router_jitter_noise=0.0, + router_dtype="float32", + router_ignore_padding_tokens=False, + output_hidden_states=False, + output_attentions=False, + initializer_factor=0.002, + output_router_logits=False, + use_cache=True, + separator_token_id=35998, + pad_token_id=35995, + eos_token_id=35999, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.d_ff = d_ff + self.d_ext = d_ext + self.d_spout = d_spout + self.num_switch_layers = num_switch_layers + self.num_ext_layers = num_ext_layers + self.num_layers = num_switch_layers + num_ext_layers + self.num_heads = num_heads + self.num_experts = num_experts + self.expert_capacity = expert_capacity + self.dropout_rate = dropout_rate + self.layer_norm_epsilon = layer_norm_epsilon + self.router_bias = router_bias + self.router_jitter_noise = router_jitter_noise + self.router_dtype = router_dtype + self.router_ignore_padding_tokens = router_ignore_padding_tokens + self.output_hidden_states = output_hidden_states + self.output_attentions = output_attentions + self.initializer_factor = initializer_factor + self.output_router_logits = output_router_logits + self.use_cache = use_cache + + super().__init__( + separator_token_id=separator_token_id, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + +__all__ = ["GPTSanJapaneseConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..0e2d27f03bacbd60ade444fe3221ada9f714cb60 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py @@ -0,0 +1,1331 @@ +# coding=utf-8 +# Copyright 2023 Toshiyuki Sakamoto(tanreinama) and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch GPTSANJapanese model.""" + +from typing import Optional, Union + +import torch +import torch.nn as nn + +from ....activations import ACT2FN +from ....cache_utils import Cache +from ....modeling_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPastAndCrossAttentions +from ....modeling_utils import PreTrainedModel +from ....utils import ( + DUMMY_INPUTS, + DUMMY_MASK, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_torch_fx_proxy, + logging, +) +from ....utils.deprecation import deprecate_kwarg +from .configuration_gptsan_japanese import GPTSanJapaneseConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "GPTSanJapaneseConfig" +_CHECKPOINT_FOR_DOC = "Tanrei/GPTSAN-japanese" + +#################################################### +# This dict contains ids and associated url +# for the pretrained weights provided with the models +#################################################### + + +def router_z_loss_func(router_logits: torch.Tensor) -> float: + r""" + Compute the router z-loss implemented in PyTorch. + + The router z-loss was introduced in [Designing Effective Sparse Expert Models](https://huggingface.co/papers/2202.08906). + It encourages router logits to remain small in an effort to improve stability. + + Args: + router_logits (`float`): + Input logits of shape [batch_size, sequence_length, num_experts] + + Returns: + Scalar router z-loss. + """ + num_groups, tokens_per_group, _ = router_logits.shape + log_z = torch.logsumexp(router_logits, dim=-1) + z_loss = log_z**2 + return torch.sum(z_loss) / (num_groups * tokens_per_group) + + +def load_balancing_loss_func(router_probs: torch.Tensor, expert_indices: torch.Tensor) -> float: + r""" + Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch. + + See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + + Args: + router_probs (`torch.Tensor`): + Probability assigned to each expert per token. Shape: [batch_size, sequence_length, num_experts]. + expert_indices (`torch.Tensor`): + Indices tensor of shape [batch_size, sequence_length] identifying the selected expert for a given token. + + Returns: + The auxiliary loss. + """ + num_experts = router_probs.shape[-1] + + # cast the expert indices to int64, otherwise one-hot encoding will fail + if expert_indices.dtype != torch.int64: + expert_indices = expert_indices.to(torch.int64) + + if len(expert_indices.shape) == 2: + expert_indices = expert_indices.unsqueeze(2) + + expert_mask = torch.nn.functional.one_hot(expert_indices, num_experts) + + # For a given token, determine if it was routed to a given expert. + expert_mask = torch.max(expert_mask, axis=-2).values + + # cast to float32 otherwise mean will fail + expert_mask = expert_mask.to(torch.float32) + tokens_per_group_and_expert = torch.mean(expert_mask, axis=-2) + + router_prob_per_group_and_expert = torch.mean(router_probs, axis=-2) + return torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert) * (num_experts**2) + + +class GPTSanJapaneseDenseActDense(nn.Module): + """ + FFN Layer for Switch Transformer and Extra layers + + GPTSAN can mix Switch Transformer layers and normal Transformer layers This class is used as Expert in Switch + Transformer layers and as FFN in regular Transformer layers. RELU is used in the Switch Transformer layer, and + Swish is used in the normal Transformer layer, so there is a choice of which is used in the argument. + + """ + + def __init__(self, config: GPTSanJapaneseConfig, ext_layer=False): + super().__init__() + d_inter = config.d_ext if ext_layer else config.d_ff + self.wi = nn.Linear(config.d_model, d_inter, bias=ext_layer) + self.wo = nn.Linear(d_inter, config.d_model, bias=ext_layer) + self.dropout = nn.Identity() if ext_layer else nn.Dropout(config.dropout_rate) + self.act = ACT2FN["swish" if ext_layer else "relu"] + + def forward(self, hidden_states): + r""" + Args: + hidden_states (`torch.Tensor`) : + [num_groups, tokens_per_group, hidden_dim] inputs to send to experts. + Returns: + torch.Tensor[num_groups, tokens_per_group, hidden_dim] + + """ + hidden_states = self.wi(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class GPTSanJapaneseTop1Router(nn.Module): + """ + Router using tokens choose top-1 experts assignment. + + This router uses the same mechanism as in Switch Transformer (https://huggingface.co/papers/2101.03961) and V-MoE + (https://huggingface.co/papers/2106.05974): tokens choose their top experts. Items are sorted by router_probs and then + routed to their choice of expert until the expert's expert_capacity is reached. **There is no guarantee that each + token is processed by an expert**, or that each expert receives at least one token. + + """ + + def __init__(self, config: GPTSanJapaneseConfig): + super().__init__() + self.num_experts = config.num_experts + self.expert_capacity = config.expert_capacity + self.classifier = nn.Linear(config.hidden_size, self.num_experts, bias=config.router_bias) + self.jitter_noise = config.router_jitter_noise + self.ignore_padding_tokens = config.router_ignore_padding_tokens + self.dtype = getattr(torch, config.router_dtype) + + def _compute_router_probabilities(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + r""" + Computes router probabilities from input hidden states. + + Args: + hidden_states (`torch.Tensor`): + (batch_size, sequence_length, hidden_dim) from which router probabilities are computed. + Returns: + router_probabilities (`torch.Tensor`): + Tensor of shape (batch_size, sequence_length, num_experts) corresponding to the probabilities for each + token and expert. Used for routing tokens to experts. + router_logits (`torch.Tensor`): + Logits tensor of shape (batch_size, sequence_length, num_experts) corresponding to raw router logits. + This is used later for computing router z-loss. + """ + # float32 is used to ensure stability. See the discussion of "selective precision" in + # https://huggingface.co/papers/2101.03961. + # We also store the previous dtype to cast back the output to the previous dtype + self.input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(self.dtype) + + if self.training and self.jitter_noise > 0: + # Multiply the token inputs by the uniform distribution - adding some noise + hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise) + + # Shape: [num_groups, tokens_per_group, num_experts] + self._cast_classifier() + router_logits = self.classifier(hidden_states) + + # Apply Softmax and cast back to the original `dtype` + router_probabilities = nn.functional.softmax(router_logits, dim=-1, dtype=self.dtype).to(self.input_dtype) + return router_probabilities, router_logits + + def _cast_classifier(self): + r""" + `bitsandbytes` `Linear8bitLt` layers does not support manual casting Therefore we need to check if they are an + instance of the `Linear8bitLt` class by checking special attributes. + """ + if not (hasattr(self.classifier, "SCB") or hasattr(self.classifier, "CB")): + self.classifier = self.classifier.to(self.dtype) + + def forward(self, hidden_states: torch.Tensor) -> tuple: + r""" + Generic forward function for every Router class. Each Router expects to have the same input hidden states + (`hidden_states`) corresponding to the hidden states for each token, the `expert_capacity` corresponding to the + number of tokens the Router will send to each expert, some Routers can send up to few tokens to each expert. + + Each Router works as the following: it expects the hidden states for each token, gets the `router_probs` and + `router_logits` from the `router_weights`. This will assign for each token, the raw probability to be assigned + to an expert. Then each Router class will have to define its own `_compute_routing_instructions`. + + Args: + hidden_states (`torch.Tensor`) : + [num_groups, tokens_per_group, hidden_dim] inputs to send to experts. + Returns: + tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`] Tuple containing the expert index, the router probs + and the router logits. The router probabilities and logits are required to compute the loss. + """ + router_probs, router_logits = self._compute_router_probabilities(hidden_states) + + expert_index = torch.argmax(router_probs, dim=-1) + expert_index = torch.nn.functional.one_hot(expert_index, num_classes=self.num_experts) + + # Mask tokens outside expert capacity. Sum over each sequence + token_priority = torch.cumsum(expert_index, dim=-2) + # mask if the token routed to to the expert will overflow + expert_capacity_mask = token_priority <= self.expert_capacity + expert_index = expert_index * expert_capacity_mask + + router_probs = torch.max(router_probs, dim=-1).values.unsqueeze(-1) + return expert_index, router_probs, router_logits + + +class GPTSanJapaneseSparseMLP(nn.Module): + r""" + Implementation of the Switch Transformers Sparse MLP module. + """ + + def __init__(self, config: GPTSanJapaneseConfig, expert_class: nn.Module = GPTSanJapaneseDenseActDense): + super().__init__() + # Step 1: Get the correct router according to its class + self.router = GPTSanJapaneseTop1Router(config) + + # Step 2: Get the experts + self.experts = nn.ModuleDict() + for idx in range(config.num_experts): + self.experts[f"expert_{idx}"] = expert_class(config) + + def forward(self, hidden_states): + r""" + Hold on, this will be slightly tricky to understand In the correct order, a MoE layer does the following: + + 1- Gets the `router_mask` from the router. The shape of the mask is `(batch_size, sequence_length, num_expert)` + and corresponds to the argmax of the `router_probs`. The probabilities are needed in the computation of the + hidden states : they are broadcasted to the hidden states values (can be interpreted as a scaling factor). + + 2- Dispatch the tokens to its associated experts. We do a classic for loop over the experts and assign for each + expert the corresponding hidden states. + + """ + # Step 1: Get the router_mask from the router as well as the probabilities + router_mask, router_probs, router_logits = self.router(hidden_states) + expert_index = torch.argmax(router_mask, dim=-1) + + # The routers introduced might not always map all the tokens, to a router, which means that some hidden states + # can be unchanged from one layer to another. That is why the hidden states are cloned before updating only the selected ones. + + next_states = hidden_states.clone() + for idx, expert in enumerate(self.experts.values()): + token_indices = router_mask[:, :, idx].bool() + next_states[token_indices] = expert(hidden_states[token_indices]).to(next_states.dtype) + + hidden_states = router_probs * next_states + return hidden_states, (router_logits, expert_index) + + +class GPTSanJapaneseLayerSparseFF(nn.Module): + r""" + Switch Transformers Feed Forward layer module. This is a wrapper around the Mixture of Experts module. + + Parameters: + config : ([`GPTSanJapaneseConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + """ + + def __init__(self, config: GPTSanJapaneseConfig): + super().__init__() + self.mlp = GPTSanJapaneseSparseMLP(config) + self.soft_bypass_mlp = nn.Linear(config.d_model, config.d_model, bias=False) + self.norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + + def forward(self, hidden_states, output_router_logits): + r""" + Args: + hidden_states (`torch.Tensor`) : + [num_groups, tokens_per_group, hidden_dim] inputs to send to experts. + output_router_logits (`bool`) : + output experts router output. + Returns: + torch.Tensor[num_groups, tokens_per_group, hidden_dim] + + """ + forwarded_states, router_tuple = self.mlp(hidden_states) + forwarded_states += torch.tanh(self.soft_bypass_mlp(hidden_states)) + output = hidden_states + self.norm(forwarded_states) + + if output_router_logits and router_tuple is not None: + return output, router_tuple + else: + return output + + +class GPTSanJapaneseLayerDenseFF(nn.Module): + r""" + Extra Transformers Feed Forward layer module. + + Parameters: + config : ([`GPTSanJapaneseConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + """ + + def __init__(self, config: GPTSanJapaneseConfig): + super().__init__() + # Check if it is a sparse layer, if not then it is a dense layer + self.mlp = GPTSanJapaneseDenseActDense(config, ext_layer=True) + self.norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + + def forward(self, hidden_states): + r""" + Args: + hidden_states (`torch.Tensor`) : + [num_groups, tokens_per_group, hidden_dim] inputs to send to experts. + Returns: + torch.Tensor[num_groups, tokens_per_group, hidden_dim] + + """ + forwarded_states = self.mlp(hidden_states) + output = hidden_states + self.norm(forwarded_states) + return output + + +class GPTSanJapaneseAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[GPTSanJapaneseConfig] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_values[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_values` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_values is not None + and past_key_values[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_values[0] + value_states = past_key_values[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_values is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_values[0], key_states], dim=2) + value_states = torch.cat([past_key_values[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_values` is always `None` + past_key_values = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_values + + +class GPTSanJapaneseLayerSelfAttention(nn.Module): + """ + Self Attention and Normalization Unit + """ + + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.self_attn = GPTSanJapaneseAttention( + embed_dim=config.d_model, + num_heads=config.num_heads, + is_decoder=True, + bias=has_relative_attention_bias, + ) + self.norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: Optional[tuple[torch.FloatTensor]], + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]: + r""" + Self-attention and normalize block. + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up + decoding. If `past_key_values` are used, the user can optionally input only the last + `decoder_input_ids` (those that don't have their past key value states given to this model) of shape + `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + head_mask (`numpy.ndarray` of shape `({0})`, `optional): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + Returns: + tuple[torch.Tensor[num_groups, tokens_per_group, hidden_dim],...] + """ + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + atten_out = self.self_attn( + hidden_states=hidden_states, + past_key_values=self_attn_past_key_value, + attention_mask=(1 - attention_mask) * torch.finfo(hidden_states.dtype).min, + layer_head_mask=head_mask, + output_attentions=output_attentions, + ) + if output_attentions: + attn_weights = (atten_out[1],) + else: + attn_weights = () + + attention_output = atten_out[0] + + hidden = hidden_states + self.norm(attention_output) + + if use_cache: + outputs = (hidden, atten_out[2]) # hidden, present, (attentions) + else: + outputs = (hidden,) # hidden, (attentions) + + return outputs + attn_weights + + +class GPTSanJapaneseBlock(nn.Module): + """ + Self Attention and FFN Unit + """ + + def __init__(self, config, ext_layer=False): + super().__init__() + self.self_attn = GPTSanJapaneseLayerSelfAttention(config) + self.feed_forward = GPTSanJapaneseLayerDenseFF(config) if ext_layer else GPTSanJapaneseLayerSparseFF(config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: Optional[tuple[torch.FloatTensor]], + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + output_router_tuple: Optional[bool] = False, + ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]: + r""" + GPTSAN transformer block. + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up + decoding. If `past_key_values` are used, the user can optionally input only the last + `decoder_input_ids` (those that don't have their past key value states given to this model) of shape + `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + head_mask (`numpy.ndarray` of shape `({0})`, `optional): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + output_attentions (`bool`) : + output attention probabirities. + output_router_tuple: + output experts router logits and expert id. + Returns: + tuple[torch.Tensor[num_groups, tokens_per_group, hidden_dim],...] + """ + atten_out = self.self_attn( + hidden_states=hidden_states, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attention_output = atten_out[0] + + if isinstance(self.feed_forward, GPTSanJapaneseLayerSparseFF): + sparse_out = self.feed_forward(attention_output, output_router_tuple) + if output_router_tuple: + hidden, router_tuple = sparse_out + else: + hidden = sparse_out + else: + hidden = self.feed_forward(attention_output) + + outputs = (hidden,) + atten_out[1:] + + if isinstance(self.feed_forward, GPTSanJapaneseLayerSparseFF) and output_router_tuple: + outputs += (router_tuple,) + + return outputs + + +class GPTSanJapanesePreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: GPTSanJapaneseConfig + base_model_prefix = "gptsan_japanese" + supports_gradient_checkpointing = False + _no_split_modules = ["GPTSanJapaneseBlock"] + _skip_keys_device_placement = "past_key_values" + + @property + def dummy_inputs(self): + input_ids = torch.tensor(DUMMY_INPUTS) + input_mask = torch.tensor(DUMMY_MASK) + dummy_inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + } + return dummy_inputs + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor # Used for testing weights initialization + if isinstance(module, nn.LayerNorm): + module.weight.data.fill_(factor * 1.0) + module.bias.data.zero_() + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=factor * 1.0) + elif isinstance(module, GPTSanJapaneseModel): + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + module.embed_tokens.weight.data.normal_(mean=0.0, std=factor * 1.0) + module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 1.0) + if hasattr(module, "extra_position_embeddings") and module.extra_position_embeddings is not None: + module.extra_position_embeddings.weight.data.normal_(mean=0.0, std=factor * 1.0) + elif isinstance(module, (GPTSanJapaneseModel, GPTSanJapaneseForConditionalGeneration)): + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + module.final_logits_bias.data.normal_(mean=0.0, std=factor * 1.0) + if hasattr(module, "lm_head") and not self.config.tie_word_embeddings: + module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0) + elif isinstance(module, GPTSanJapaneseDenseActDense): + # Mesh TensorFlow FF initialization + # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi, "bias") and module.wi.bias is not None: + module.wi.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, GPTSanJapaneseAttention): + # Multi-headed attention + d_model = self.config.d_model + key_value_proj_dim = self.config.d_model + n_heads = self.config.num_heads + module.k_proj.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5)) + module.v_proj.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5)) + module.q_proj.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5)) + module.out_proj.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5)) + elif isinstance(module, GPTSanJapaneseSparseMLP): + # Mesh TensorFlow attention initialization to avoid scaling before softmax + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + d_model = self.config.d_model + key_value_proj_dim = self.config.d_model + n_heads = self.config.num_heads + module.router.classifier.weight.data.normal_(mean=0.0, std=factor * 1) + for idx in range(self.config.num_experts): + module.experts[f"expert_{idx}"].wi.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + module.experts[f"expert_{idx}"].wo.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + if decoder_start_token_id is None: + raise ValueError( + "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. " + "See T5 docs for more information." + ) + + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. + shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id) + shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +GPTSAN_JAPANESE_START_DOCSTRING = r""" + + The [GPTSAN-japanese](https://github.com/tanreinama/GPTSAN) model was proposed in General-purpose Swich transformer + based Japanese language model + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`GPTSanJapaneseConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +GPTSAN_JAPANESE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. GPTSAN-japanese is a model that generates sentence + continuations or predicts tokens at mask positions. Special tokens required for inputs to the model are + automatically appended. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + An input that masks the Prefix part in the Prefix-LM input. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **prefix** input, + - 0 for tokens that are **not-prefix** input. + spout (`torch.Tensor` of shape `(batch_size, config.d_spout)`): + This vector is transformed through an 8-layer FFN and can be used instead of `past_key_values`. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more control over how to convert + `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + Router logits of the decoder model, useful to compute the auxiliary loss for Mixture of Experts models. +""" + + +@add_start_docstrings( + "The bare GPTSAN-japanese Model transformer outputting raw hidden-states without any specific head on top.", + GPTSAN_JAPANESE_START_DOCSTRING, +) +class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel): + def __init__(self, config: GPTSanJapaneseConfig): + super().__init__(config) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.d_model) + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model) + self.last_project = nn.Linear(config.d_model, config.d_model, bias=True) + self.act = ACT2FN["swish"] + + self.blocks = torch.nn.ModuleList([]) + for _ in range(config.num_switch_layers): + self.blocks.append(GPTSanJapaneseBlock(config)) + for _ in range(config.num_ext_layers): + self.blocks.append(GPTSanJapaneseBlock(config, ext_layer=True)) + + if config.num_ext_layers > 0: + self.extra_position_embeddings = nn.Embedding(config.max_position_embeddings, config.d_model) + + if config.d_spout: + spouts = [] + for _ in range(8): + spouts.append(nn.Linear(config.d_spout, config.d_spout, bias=False)) + spouts.append(nn.Tanh()) + spouts.append(nn.Linear(config.d_spout, config.num_layers * 2 * config.d_model, bias=False)) + self.spout = nn.Sequential(*spouts) + + self.post_init() + + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + @add_start_docstrings_to_model_forward(GPTSAN_JAPANESE_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.FloatTensor] = None, + spout: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + num_precontext: Optional[torch.LongTensor] = None, + ) -> Union[MoEModelOutputWithPastAndCrossAttentions, tuple[torch.FloatTensor]]: + r""" + num_precontext (`torch.LongTensor` of shape `(batch_size,1)`): + length of `hybrid` input tokens in the input. Tokens up to this length refer to both front and back like + BERT, tokens after that refer only to front like GPT. see also: + https://github.com/tanreinama/GPTSAN/blob/main/report/model.md + + Returns: + `MoEModelOutputWithPastAndCrossAttentions` or `tuple` if `return_dict` returns + MoEModelOutputWithPastAndCrossAttentions instead of tuple + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + device = self.position_embeddings.weight.device + if input_ids is None: + input_ids = torch.zeros([1, 1]).int().to(device) # dummy for input_ids was None + if inputs_embeds is not None: + raise NotImplementedError( + "GPTSanJapaneseModel does not use `inputs_embeds`. Make sure to pass in `input_ids` instead." + ) + num_pasts_contexts = 0 + num_batch = input_ids.shape[0] + pasts_or_spout_value = None + if past_key_values is not None: + num_pasts_contexts = past_key_values.get_seq_length() + elif self.config.d_spout and spout is not None: + # `spout` is a special input vector specific to GPTSAN + # This controls the output by projecting embedded information such as the class of sentences during learning. + # It should passed instead of the first past_key_values. + # See the original GPTSAN repository for details + num_pasts_contexts += 1 + + # If there is an attention_mask, increase first one for spout + if self.config.d_spout and spout is not None and attention_mask is not None: + attention_mask_with_spout = torch.ones(num_batch, attention_mask.shape[1] + 1, device=device) + attention_mask_with_spout[:, 1:] -= 1 - attention_mask # 1st token should be spout + attention_mask = attention_mask_with_spout # update attention_mask + + if num_precontext is not None: + # `num_precontext` is the number of tokens that refer to each other in prefix-lm + # created per batch, so dimension of num_precontext should be [batch, 1] + if not ( + len(num_precontext.shape) == 2 and num_precontext.shape[1] == 1 + ): # num_precontext Should be [batch,1] + raise ValueError("num_precontext should be [batch, 1] size.") + num_precontext = torch.reshape(num_precontext, [-1]) + else: + num_precontext = torch.zeros([num_batch]).int().to(device) + + num_input_contexts = input_ids.shape[1] + num_output_contexts = num_input_contexts + num_pasts_contexts + + hidden_states = self.embed_tokens(input_ids) + + if past_key_values is not None: + pasts_or_spout_value = past_key_values + elif self.config.d_spout and spout is not None: + # Make vector from `spout` of GPTSAN to the same shape as past_key_values + pasts_or_spout_value = self.spout(spout) # projecting `spout` vector + pasts_or_spout_value = torch.reshape( + pasts_or_spout_value, + [ + num_batch, + self.config.num_layers, + 2, + self.config.num_heads, + num_pasts_contexts, + self.config.d_model // self.config.num_heads, + ], + ) + pasts_or_spout_value = torch.split(pasts_or_spout_value, [1] * self.config.num_layers, dim=1) + # make same shape as past_key_values + pasts_or_spout_value = tuple( + tuple(b.squeeze(1) for b in torch.split(a.squeeze(1), [1, 1], dim=1)) for a in pasts_or_spout_value + ) + else: + pasts_or_spout_value = [None] * self.config.num_layers + + # Token position considering spout and pasts + token_position = torch.arange(num_input_contexts).to(device) + num_pasts_contexts + + if attention_mask is None: + attention_mask = torch.ones(num_batch, num_input_contexts, device=device) + + # positions for get position_embeddings + gather_position = ( + ( + torch.zeros((num_batch, self.config.d_model, num_input_contexts)).to(device) + + token_position.unsqueeze(0) + ) + .transpose(1, 2) + .long() + ) + # When padding with padding_side="left", zeros line up on the left side of attention_mask, so position_embeddings is shifted accordingly + gather_position -= (1 - attention_mask).argmin(dim=-1).unsqueeze(1).unsqueeze(2) + gather_position = torch.clip(gather_position, num_pasts_contexts, self.config.max_position_embeddings - 1) + + # attention_mask is applied per batch + for i in range(num_batch): + hidden_states[i] += torch.gather(self.position_embeddings.weight, dim=0, index=gather_position[i]) + + # Create a mask to be used when making the prefix Input length of Prefix-LM variable + causal_mask = ( + torch.tril(torch.ones((num_output_contexts, num_output_contexts), dtype=torch.uint8)) + .view(1, 1, num_output_contexts, num_output_contexts) + .to(device) + ) + prefix_lm_mask = causal_mask[:, :, -num_input_contexts:, :] + if token_type_ids is not None: + token_type_ids = token_type_ids.unsqueeze(1).unsqueeze(2) + prefix_lm_mask = ((prefix_lm_mask + token_type_ids) > 0).float() + # Merge prefix_lm_mask and attention_mask + extended_attention_mask = prefix_lm_mask * attention_mask.unsqueeze(1).unsqueeze(2) + + # Prepare head mask if needed + if head_mask is not None: + head_mask = self.get_head_mask( + head_mask, self.config.num_switch_layers + self.config.num_ext_layers + ) # n_layer x batch x n_heads x N x N + + # outputs + present_key_value_states = () if self.config.use_cache or use_cache else None + all_hidden_states = () if self.config.output_hidden_states or output_hidden_states else None + all_attentions = () if self.config.output_attentions or output_attentions else None + all_router_probs = () if self.config.output_router_logits or output_router_logits else None + + for layer, past in enumerate(pasts_or_spout_value): + if layer == self.config.num_switch_layers: + if self.config.num_ext_layers > 0: + # extra_position_embeddings are extra position embeddings that are only created when extending the model with code from the original GPTSAN repository. Not used in the default model. + # However, it is created when you create an additional layer and partially train only that location. + # Therefore, convert_gptsan_tf_checkpoint_to_pytorch.py is used when converting and loading models created in the original GPTSAN repository. + for i in range(num_batch): + hidden_states[i] += torch.gather( + self.extra_position_embeddings.weight, dim=0, index=gather_position[i] + ) + + output_router_tuple = ( + self.config.output_router_logits or output_router_logits + ) and layer < self.config.num_switch_layers + block_output = self.blocks[layer]( + hidden_states=hidden_states, + past_key_values=past, + attention_mask=extended_attention_mask, + head_mask=head_mask, + use_cache=self.config.use_cache or use_cache, + output_attentions=self.config.output_attentions or output_attentions, + output_router_tuple=output_router_tuple, + ) + + outpos = 0 + hidden_states = block_output[outpos] + if self.config.output_hidden_states or output_hidden_states: + all_hidden_states += (hidden_states,) + if self.config.use_cache or use_cache: + outpos += 1 + present = block_output[outpos] + present_key_value_states += (present,) + if self.config.output_attentions or output_attentions: + outpos += 1 + attention_probs = block_output[outpos] + all_attentions += (attention_probs,) + if output_router_tuple: + outpos += 1 + router_tuple = block_output[outpos] + all_router_probs.append(router_tuple[0]) + + hidden_states = self.last_project(hidden_states) + hidden_states = self.act(hidden_states) + + if self.config.output_hidden_states or output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_router_probs, + ] + if v is not None + ) + + return MoEModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_value_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + router_probs=all_router_probs, + ) + + +@add_start_docstrings( + "The bare GPTSAN-japanese Model with a language modeling head.", + GPTSAN_JAPANESE_START_DOCSTRING, +) +class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: GPTSanJapaneseConfig): + super().__init__(config) + self.model = GPTSanJapaneseModel(config) + self.register_buffer("final_logits_bias", torch.zeros([1, config.vocab_size])) + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + if not self.config.torchscript: + self.lm_head.weight = self.model.embed_tokens.weight + + @add_start_docstrings_to_model_forward(GPTSAN_JAPANESE_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.FloatTensor] = None, + spout: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[tuple[torch.FloatTensor], MoECausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification loss. Indices should be in `[-100, 0, ..., + config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for + labels in `[0, ..., config.vocab_size]` + + Returns: + `MoECausalLMOutputWithPast` or `tuple` if `return_dict` returns MoECausalLMOutputWithPast instead of tuple + + Example: + + Text Generation with regular LM Model + ```python + >>> from transformers import AutoModel, AutoTokenizer, trainer_utils + + >>> device = "cuda" + >>> model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").to(device) + >>> tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") + >>> x_token = tokenizer("織田信長は、", return_tensors="pt") + >>> trainer_utils.set_seed(30) + >>> input_ids = x_token.input_ids.to(device) + >>> gen_token = model.generate(input_ids, max_new_tokens=50) + >>> tokenizer.decode(gen_token[0]) + "織田信長は、政治・軍事の中枢まで掌握した政治家であり、日本史上類を見ない驚異的な軍事侵攻を続け..." + ``` + + Text Generation with Prefix-LM Model + ```python + >>> from transformers import AutoModel, AutoTokenizer, trainer_utils + + >>> device = "cuda" + >>> model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").to(device) + >>> tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") + >>> x_token = tokenizer("", prefix_text="織田信長は、", return_tensors="pt") + >>> trainer_utils.set_seed(30) + >>> input_ids = x_token.input_ids.to(device) + >>> token_type_ids = x_token.token_type_ids.to(device) + >>> gen_token = model.generate(input_ids, token_type_ids=token_type_ids, max_new_tokens=50) + >>> tokenizer.decode(gen_token[0]) + "織田信長は、政治・外交で数々の戦果を上げるが、1568年からは、いわゆる本能寺の変で細川晴元に暗殺される..." + ``` + + Simultaneously Text Generation And Masked Language Model + ```python + >>> from transformers import AutoModel, AutoTokenizer, trainer_utils + + >>> device = "cuda" + >>> model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").to(device) + >>> tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") + >>> masked_sentence = "武田信玄は、<|inputmask|>時代ファンならぜひ押さえ<|inputmask|>きたい名将の一人。" + >>> x_token = tokenizer("", prefix_text=masked_sentence, return_tensors="pt") + >>> trainer_utils.set_seed(30) + >>> input_ids = x_token.input_ids.to(device) + >>> token_type_ids = x_token.token_type_ids.to(device) + >>> out_lm_token = model.generate(input_ids, token_type_ids=token_type_ids, max_new_tokens=50) + >>> out_mlm_token = model(input_ids, token_type_ids=token_type_ids).logits.argmax(axis=-1) + >>> tokenizer.decode(out_mlm_token[0]) + "武田信玄は、戦国時代ファンならぜひ押さえておきたい名将の一人。" + + >>> tokenizer.decode(out_lm_token[0][input_ids.shape[1] :]) + "武田氏の三代に渡った武田家のひとり\n甲斐市に住む、日本史上最大の戦国大名。..." + ```""" + SEG_TOKEN = self.config.separator_token_id + use_cache = use_cache or self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + model_return_dict = True + num_precontext = None + if input_ids is not None: + num_batch = input_ids.shape[0] + num_precontext = torch.zeros([num_batch]).int().to(input_ids.device) + where_separators = torch.where(input_ids == SEG_TOKEN) + num_precontext[where_separators[0]] += where_separators[1] + num_precontext = num_precontext.unsqueeze(1) + + outputs = self.model( + input_ids, + attention_mask, + token_type_ids, + spout, + past_key_values, + head_mask, + use_cache, + inputs_embeds, + decoder_inputs_embeds, + output_attentions, + output_hidden_states, + model_return_dict, + output_router_logits, + num_precontext, + ) + + lm_logits = self.lm_head(outputs[0]) + if lm_logits.shape[-1] == self.final_logits_bias.shape[-1]: + lm_logits = lm_logits + self.final_logits_bias + + loss = None + z_loss = None + router_probs = None + aux_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + + loss_fct = nn.CrossEntropyLoss(ignore_index=-100) + + if output_router_logits: + # Compute the router loss (z_loss + auxiliary loss) for each router in the encoder and decoder + router_logits, expert_indexes = self._unpack_router_logits(outputs.router_probs) + z_loss = router_z_loss_func(router_logits) + router_probs = nn.Softmax(dim=-1)(router_logits) + aux_loss = load_balancing_loss_func(router_probs, expert_indexes) + + loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) + + if not return_dict: + return tuple( + v + for v in [ + loss, + lm_logits, + outputs.past_key_values, + outputs.hidden_states, + outputs.router_probs, + z_loss, + aux_loss, + ] + if v is not None + ) + + return MoECausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_probs, + z_loss=z_loss, + aux_loss=aux_loss, + ) + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + attention_mask: torch.FloatTensor, + token_type_ids: Optional[torch.FloatTensor] = None, + spout: Optional[Union[list, torch.FloatTensor]] = None, + past_key_values: Optional[Cache] = None, + **kwargs, + ): + if isinstance(spout, list): + spout = torch.tensor(spout).float() + if input_ids is not None: + spout = spout.to(input_ids.device) + if past_key_values is not None: + return { + "input_ids": input_ids[:, -1:] if input_ids is not None else None, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids[:, -1:] if token_type_ids is not None else None, + "spout": spout, + "past_key_values": past_key_values, + } + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + "spout": spout, + "past_key_values": None, + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + def resize_token_embeddings( + self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True + ) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing) + self._resize_final_logits_bias(new_embeddings.weight.shape[0]) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.model.set_input_embeddings(new_embeddings) + + def _unpack_router_logits(self, router_outputs): + total_router_logits = [] + total_expert_indexes = [] + for router_output in router_outputs: + if len(router_output[0].shape) > 1: + router_logits, expert_indexes = router_output + total_router_logits.append(router_logits) + total_expert_indexes.append(expert_indexes) + return torch.cat(total_router_logits, dim=1), torch.cat(total_expert_indexes, dim=1) + + +__all__ = ["GPTSanJapaneseForConditionalGeneration", "GPTSanJapaneseModel", "GPTSanJapanesePreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..1025fdf75fb46a0d350359a66c09315bd42effd3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py @@ -0,0 +1,518 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for GPTSANJapanese.""" + +import collections +import json +import os +import re +import sys +from typing import Optional, Union + +import numpy as np + +from ....tokenization_utils import PreTrainedTokenizer +from ....tokenization_utils_base import ( + BatchEncoding, + PreTokenizedInput, + PreTokenizedInputPair, + TextInput, + TextInputPair, + TruncationStrategy, +) +from ....utils import PaddingStrategy, logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "emoji_file": "emoji.json"} + + +def load_vocab_and_emoji(vocab_file, emoji_file): + """Loads a vocabulary file and emoji file into a dictionary.""" + with open(emoji_file, "r", encoding="utf-8") as f: + emoji = json.loads(f.read()) + + vocab = collections.OrderedDict() + raw_vocab = collections.OrderedDict() + ids_to_tokens = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as f: + token = f.readlines() + token = [[t.rstrip("\n")] if (t == ",\n" or "," not in t) else t.rstrip("\n").split(",") for t in token] + for idx, b in enumerate(token): + ids_to_tokens[idx] = b + raw_vocab[",".join(b)] = idx + for wd in b: + vocab[wd] = idx + + return vocab, raw_vocab, ids_to_tokens, emoji + + +class GPTSanJapaneseTokenizer(PreTrainedTokenizer): + """ + This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications + - Decoding byte0~byte255 tokens correctly + - Added bagofword token handling + - Return token_type_ids for Prefix-LM model + The bagofword token represents a repetition of the previous token and is converted to 3 consecutive tokens when + decoding In addition, the original Japanese special Sub-Word-Encoding has been released in this repository + (https://github.com/tanreinama/Japanese-BPEEncoder_V2). The token_type_ids is a mask indicating the prefix input + position of the Prefix-LM model. To specify a prefix position, specify a prefix input for prefix_text, or specify a + sentence of the prefix part and the part after it as a text pair of batch input. + + Example: + + ```python + >>> from transformers import GPTSanJapaneseTokenizer + + >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") + >>> # You can confirm both 慶応 and 慶應 are encoded to 17750 + >>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"] + [35993, 35998, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281] + + >>> # Both 慶応 and 慶應 are decoded to 慶応 + >>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]) + '吾輩は猫である🐯。実は慶応(慶応)大学出身' + ``` + + Example for Prefix-LM: + + ```python + >>> from transformers import GPTSanJapaneseTokenizer + + >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") + >>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["input_ids"] + [35993, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 35998, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281] + + >>> # Mask for Prefix-LM inputs + >>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["token_type_ids"] + [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ``` + + Example for batch encode: + + ```python + >>> from transformers import GPTSanJapaneseTokenizer + + >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") + >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["input_ids"] + [[35993, 35998, 8640, 25948, 35993, 35998, 30647, 35675, 35999, 35999], [35993, 35998, 10382, 9868, 35993, 35998, 30646, 9459, 30646, 35675]] + + >>> # Mask for Prefix-LM inputs + >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["token_type_ids"] + [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]] + + >>> # Mask for padding + >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["attention_mask"] + [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]] + ``` + + Args: + vocab_file (`str`): + File containing the vocabulary. + emoji_file (`str`): + File containing the emoji. + unk_token (`str`, *optional*, defaults to `"<|nottoken|>"`): + The token used for unknown character + pad_token (`str`, *optional*, defaults to `"<|separator|>"`): + The token used for padding + bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The end of sequence token. + sep_token (`str`, *optional*, defaults to `"<|segmenter|>"`): + A special token to separate token to prefix part and general input part. + do_clean_text (`bool`, *optional*, defaults to `False`): + Whether or not to clean text for URL, EMAIL, TEL, Japanese DATE and Japanese PRICE. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask", "token_type_ids"] + + def __init__( + self, + vocab_file, + emoji_file, + unk_token="<|nottoken|>", + pad_token="<|separator|>", + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + sep_token="<|segmenter|>", + do_clean_text=False, + **kwargs, + ): + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + if not os.path.isfile(emoji_file): + raise ValueError( + f"Can't find a emoji file at path '{emoji_file}'. To load the emoji information from a Google" + " pretrained model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.do_clean_text = do_clean_text + self.vocab, self.raw_vocab, self.ids_to_tokens, self.emoji = load_vocab_and_emoji(vocab_file, emoji_file) + self.subword_tokenizer = SubWordJapaneseTokenizer( + vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji + ) + + super().__init__( + unk_token=unk_token, + pad_token=pad_token, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + do_clean_text=do_clean_text, + **kwargs, + ) + + @property + def vocab_size(self): + # self.vocab contains support for character fluctuation unique to Japanese, and has a large number of vocab + return len(self.raw_vocab) + + def get_vocab(self): + return dict(self.raw_vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.subword_tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + words = [] + byte_tokens = [] + for word in tokens: + if word[:6] == "<|byte" and word[-2:] == "|>": + byte_tokens.append(int(word[6:-2])) + else: + if len(byte_tokens) > 0: + words.append(bytearray(byte_tokens).decode("utf-8", errors="replace")) + byte_tokens = [] + if word[:7] == "<|emoji" and word[-2:] == "|>": + words.append(self.emoji["emoji_inv"][word]) + elif word == "": + words.append(" ") + elif word == "
": + words.append("\n") + elif word == "": + words.append("\t") + elif word == "": + words.append("▀") + elif word == "": + words.append("ǀ") + elif word == "": + words.append("‖") + elif word == "<|bagoftoken|>": + if len(words) > 0: + words.append(words[-1]) + words.append(words[-1]) + words.append(words[-1]) + elif word.startswith("<|") and word.endswith("|>"): + words.append("") + else: + words.append(word) + if len(byte_tokens) > 0: + words.append(bytearray(byte_tokens).decode("utf-8", errors="replace")) + text = "".join(words) + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + emoji_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["emoji_file"] + ) + else: + vocab_file = ( + (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["vocab_file"] + ) + emoji_file = ( + (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["emoji_file"] + ) + with open(vocab_file, "w", encoding="utf-8") as writer: + for token_index, token in self.ids_to_tokens.items(): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(",".join(token) + "\n") + index += 1 + with open(emoji_file, "w", encoding="utf-8") as writer: + json.dump(self.emoji, writer) + return vocab_file, emoji_file + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + # docstyle-ignore + """ + The tokenizer returns token_type_ids as separators between the Prefix part and the rest. + token_type_ids is 1 for the Prefix part and 0 for the rest of the token. + + Example: + ```python + >>> from transformers import GPTSanJapaneseTokenizer + + >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") + >>> x_token = tokenizer("アイウエ") + >>> # input_ids: | SOT | SEG | ア | イ | ウ | エ | + >>> # token_type_ids: | 1 | 0 | 0 | 0 | 0 | 0 | + + >>> x_token = tokenizer("", prefix_text="アイウエ") + >>> # input_ids: | SOT | ア | イ | ウ | エ | SEG | + >>> # token_type_ids: | 1 | 1 | 1 | 1 | 1 | 0 | + + >>> x_token = tokenizer("ウエ", prefix_text="アイ") + >>> # input_ids: | SOT | ア | イ | SEG | ウ | エ | + >>> # token_type_ids: | 1 | 1 | 1 | 0 | 0 | 0 | + ```""" + prefix_len = 0 + if self.sep_token in self.vocab: + segid = self.vocab[self.sep_token] + if segid in token_ids_0: + prefix_len = token_ids_0.index(segid) + if token_ids_1 is None: + total_len = len(token_ids_0) + else: + total_len = len(token_ids_0 + token_ids_1) + return prefix_len * [1] + (total_len - prefix_len) * [0] + + def prepare_for_tokenization(self, text, prefix_text=None, add_sep_token=None, **kwargs): + # GPTSAN inserts extra SEP tokens in Prefix-LM in addition to SOT for text generation. + # SOT at the beginning of the text, and SEP at the separator between the Prefix part and the rest. + if add_sep_token is None: + add_sep_token = self.sep_token not in text # If insert un-prefix position explicitly + prepared = self.bos_token if self.bos_token in self.vocab else "" + prepared += prefix_text if prefix_text is not None else "" + if add_sep_token: + prepared += self.sep_token if self.sep_token in self.vocab else "" + prepared += text + return (prepared, kwargs) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + list[TextInput], list[TextInputPair], list[PreTokenizedInput], list[PreTokenizedInputPair] + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + # This tokenizer converts input text pairs into Prefix input and subsequent input + if isinstance(batch_text_or_text_pairs[0], tuple) or isinstance(tuple(batch_text_or_text_pairs[0]), list): + # As a single text with an explicit un-prefix position + batch_prefix_texts = [] + for pref, txt in batch_text_or_text_pairs: + batch_prefix_texts.append(pref + self.sep_token + txt) + batch_text_or_text_pairs = batch_prefix_texts + + return super()._batch_encode_plus( + batch_text_or_text_pairs, + add_special_tokens, + padding_strategy, + truncation_strategy, + max_length, + stride, + is_split_into_words, + pad_to_multiple_of, + return_tensors, + return_token_type_ids, + return_attention_mask, + return_overflowing_tokens, + return_special_tokens_mask, + return_offsets_mapping, + return_length, + verbose, + **kwargs, + ) + + +class SubWordJapaneseTokenizer: + """ + This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications + - Decoding byte0~byte255 tokens correctly + - Added bagofword token handling + + https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the + original repository. + + MIT License + + Copyright (c) 2020 tanreinama + + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + documentation files (the "Software"), to deal in the Software without restriction, including without limitation the + rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all copies or substantial portions of + the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO + THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + """ + + def __init__(self, vocab, ids_to_tokens, emoji): + self.vocab = vocab # same as swe + self.ids_to_tokens = ids_to_tokens # same as bpe + self.emoji = emoji + self.maxlen = np.max([len(w) for w in self.vocab]) + self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)") + self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*") + self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}") + self.content_repatter4 = re.compile( + r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*" + ) + self.content_repatter5 = re.compile( + r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*" + ) + # The original version of this regex displays catastrophic backtracking behaviour. We avoid this using + # possessive quantifiers in Py >= 3.11. In versions below this, we avoid the vulnerability using a slightly + # different regex that should generally have the same behaviour in most non-pathological cases. + if sys.version_info >= (3, 11): + self.content_repatter6 = re.compile( + r"(?:\d,\d{3}|[\d億])*+" + r"(?:\d,\d{3}|[\d万])*+" + r"(?:\d,\d{3}|[\d千])*+" + r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+" + r"(?:\(税込\)|\(税抜\)|\+tax)*" + ) + else: + self.content_repatter6 = re.compile( + r"(?:\d,\d{3}|[\d億万千])*" + r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+" + r"(?:\(税込\)|\(税抜\)|\+tax)*" + ) + keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿" + blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟" + self.content_trans1 = str.maketrans(dict.fromkeys(keisen + blocks, "")) + + def __len__(self): + return len(self.ids_to_tokens) + + def clean_text(self, content): + content = self.content_repatter1.sub("", content) + content = self.content_repatter2.sub("", content) + content = self.content_repatter3.sub("", content) + content = self.content_repatter4.sub("", content) + content = self.content_repatter5.sub("", content) + content = self.content_repatter6.sub("", content) + content = content.translate(self.content_trans1) + while "" in content: + content = content.replace("", "") + return content + + def tokenize(self, text, clean=False): + text = text.replace(" ", "") + text = text.replace(" ", "") + text = text.replace("\r\n", "
") + text = text.replace("\n", "
") + text = text.replace("\r", "
") + text = text.replace("\t", "") + text = text.replace("—", "ー") + text = text.replace("−", "ー") + for k, v in self.emoji["emoji"].items(): + if k in text: + text = text.replace(k, v) + if clean: + text = self.clean_text(text) + + def check_simbol(x): + e = x.encode() + if len(x) == 1 and len(e) == 2: + c = (int(e[0]) << 8) + int(e[1]) + if ( + (c >= 0xC2A1 and c <= 0xC2BF) + or (c >= 0xC780 and c <= 0xC783) + or (c >= 0xCAB9 and c <= 0xCBBF) + or (c >= 0xCC80 and c <= 0xCDA2) + ): + return True + return False + + def checku2e(x): + e = x.encode() + if len(x) == 1 and len(e) == 3: + c = (int(e[0]) << 16) + (int(e[1]) << 8) + int(e[2]) + if c >= 0xE28080 and c <= 0xE2B07F: + return True + return False + + pos = 0 + result = [] + while pos < len(text): + end = min(len(text), pos + self.maxlen + 1) if text[pos] == "<" else pos + 3 + candidates = [] # (token_id, token, pos) + for e in range(end, pos, -1): + wd = text[pos:e] + if wd in self.vocab: + if wd[0] == "<" and len(wd) > 2: + candidates = [(self.vocab[wd], wd, e)] + break + else: + candidates.append((self.vocab[wd], wd, e)) + if len(candidates) > 0: + # the smallest token_id is adopted + _, wd, e = min(candidates, key=lambda x: x[0]) + result.append(wd) + pos = e + else: + end = pos + 1 + wd = text[pos:end] + if check_simbol(wd): + result.append("") + elif checku2e(wd): + result.append("") + else: + for i in wd.encode("utf-8"): + result.append("<|byte%d|>" % i) + pos = end + return result + + def convert_id_to_token(self, index): + return self.ids_to_tokens[index][0] + + +__all__ = ["GPTSanJapaneseTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3a4b3eb1be2b4e69dcad1540abdd91f412de0ca2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_graphormer import * + from .modeling_graphormer import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/algos_graphormer.pyx b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/algos_graphormer.pyx new file mode 100644 index 0000000000000000000000000000000000000000..a0fafbdee53b55efb9596036817b03be0d006992 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/algos_graphormer.pyx @@ -0,0 +1,107 @@ +# Copyright (c) Microsoft Corporation and HuggingFace +# Licensed under the MIT License. + +import cython + +cimport numpy +from cython.parallel cimport parallel, prange + +import numpy as np + + +# Reduce this number if matrices are too big for large graphs +UNREACHABLE_NODE_DISTANCE = 510 + +def floyd_warshall(adjacency_matrix): + """ + Applies the Floyd-Warshall algorithm to the adjacency matrix, to compute the + shortest paths distance between all nodes, up to UNREACHABLE_NODE_DISTANCE. + """ + (nrows, ncols) = adjacency_matrix.shape + assert nrows == ncols + cdef unsigned int n = nrows + + adj_mat_copy = adjacency_matrix.astype(np.int32, order='C', casting='safe', copy=True) + assert adj_mat_copy.flags['C_CONTIGUOUS'] + cdef numpy.ndarray[numpy.int32_t, ndim=2, mode='c'] M = adj_mat_copy + cdef numpy.ndarray[numpy.int32_t, ndim=2, mode='c'] path = -1 * np.ones([n, n], dtype=np.int32) + + cdef unsigned int i, j, k + cdef numpy.int32_t M_ij, M_ik, cost_ikkj + cdef numpy.int32_t* M_ptr = &M[0,0] + cdef numpy.int32_t* M_i_ptr + cdef numpy.int32_t* M_k_ptr + + # set unreachable nodes distance to UNREACHABLE_NODE_DISTANCE + for i in range(n): + for j in range(n): + if i == j: + M[i][j] = 0 + elif M[i][j] == 0: + M[i][j] = UNREACHABLE_NODE_DISTANCE + + # floyed algo + for k in range(n): + M_k_ptr = M_ptr + n*k + for i in range(n): + M_i_ptr = M_ptr + n*i + M_ik = M_i_ptr[k] + for j in range(n): + cost_ikkj = M_ik + M_k_ptr[j] + M_ij = M_i_ptr[j] + if M_ij > cost_ikkj: + M_i_ptr[j] = cost_ikkj + path[i][j] = k + + # set unreachable path to UNREACHABLE_NODE_DISTANCE + for i in range(n): + for j in range(n): + if M[i][j] >= UNREACHABLE_NODE_DISTANCE: + path[i][j] = UNREACHABLE_NODE_DISTANCE + M[i][j] = UNREACHABLE_NODE_DISTANCE + + return M, path + + +def get_all_edges(path, i, j): + """ + Recursive function to compute all possible paths between two nodes from the graph adjacency matrix. + """ + cdef int k = path[i][j] + if k == -1: + return [] + else: + return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j) + + +def gen_edge_input(max_dist, path, edge_feat): + """ + Generates the full edge feature and adjacency matrix. + Shape: num_nodes * num_nodes * max_distance_between_nodes * num_edge_features + Dim 1 is the input node, dim 2 the output node of the edge, dim 3 the depth of the edge, dim 4 the feature + """ + (nrows, ncols) = path.shape + assert nrows == ncols + cdef unsigned int n = nrows + cdef unsigned int max_dist_copy = max_dist + + path_copy = path.astype(long, order='C', casting='safe', copy=True) + edge_feat_copy = edge_feat.astype(long, order='C', casting='safe', copy=True) + assert path_copy.flags['C_CONTIGUOUS'] + assert edge_feat_copy.flags['C_CONTIGUOUS'] + + cdef numpy.ndarray[numpy.int32_t, ndim=4, mode='c'] edge_fea_all = -1 * np.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=np.int32) + cdef unsigned int i, j, k, num_path, cur + + for i in range(n): + for j in range(n): + if i == j: + continue + if path_copy[i][j] == UNREACHABLE_NODE_DISTANCE: + continue + path = [i] + get_all_edges(path_copy, i, j) + [j] + num_path = len(path) - 1 + for k in range(num_path): + edge_fea_all[i, j, k, :] = edge_feat_copy[path[k], path[k+1], :] + + return edge_fea_all diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/collating_graphormer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/collating_graphormer.py new file mode 100644 index 0000000000000000000000000000000000000000..88657bab435d95cf5f06014524b184939e8582c7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/collating_graphormer.py @@ -0,0 +1,135 @@ +# Copyright (c) Microsoft Corporation and HuggingFace +# Licensed under the MIT License. + +from collections.abc import Mapping +from typing import Any + +import numpy as np +import torch + +from ....utils import is_cython_available, requires_backends + + +if is_cython_available(): + import pyximport + + pyximport.install(setup_args={"include_dirs": np.get_include()}) + from . import algos_graphormer + + +def convert_to_single_emb(x, offset: int = 512): + feature_num = x.shape[1] if len(x.shape) > 1 else 1 + feature_offset = 1 + np.arange(0, feature_num * offset, offset, dtype=np.int64) + x = x + feature_offset + return x + + +def preprocess_item(item, keep_features=True): + requires_backends(preprocess_item, ["cython"]) + + if keep_features and "edge_attr" in item: # edge_attr + edge_attr = np.asarray(item["edge_attr"], dtype=np.int64) + else: + edge_attr = np.ones((len(item["edge_index"][0]), 1), dtype=np.int64) # same embedding for all + + if keep_features and "node_feat" in item: # input_nodes + node_feature = np.asarray(item["node_feat"], dtype=np.int64) + else: + node_feature = np.ones((item["num_nodes"], 1), dtype=np.int64) # same embedding for all + + edge_index = np.asarray(item["edge_index"], dtype=np.int64) + + input_nodes = convert_to_single_emb(node_feature) + 1 + num_nodes = item["num_nodes"] + + if len(edge_attr.shape) == 1: + edge_attr = edge_attr[:, None] + attn_edge_type = np.zeros([num_nodes, num_nodes, edge_attr.shape[-1]], dtype=np.int64) + attn_edge_type[edge_index[0], edge_index[1]] = convert_to_single_emb(edge_attr) + 1 + + # node adj matrix [num_nodes, num_nodes] bool + adj = np.zeros([num_nodes, num_nodes], dtype=bool) + adj[edge_index[0], edge_index[1]] = True + + shortest_path_result, path = algos_graphormer.floyd_warshall(adj) + max_dist = np.amax(shortest_path_result) + + input_edges = algos_graphormer.gen_edge_input(max_dist, path, attn_edge_type) + attn_bias = np.zeros([num_nodes + 1, num_nodes + 1], dtype=np.single) # with graph token + + # combine + item["input_nodes"] = input_nodes + 1 # we shift all indices by one for padding + item["attn_bias"] = attn_bias + item["attn_edge_type"] = attn_edge_type + item["spatial_pos"] = shortest_path_result.astype(np.int64) + 1 # we shift all indices by one for padding + item["in_degree"] = np.sum(adj, axis=1).reshape(-1) + 1 # we shift all indices by one for padding + item["out_degree"] = item["in_degree"] # for undirected graph + item["input_edges"] = input_edges + 1 # we shift all indices by one for padding + if "labels" not in item: + item["labels"] = item["y"] + + return item + + +class GraphormerDataCollator: + def __init__(self, spatial_pos_max=20, on_the_fly_processing=False): + if not is_cython_available(): + raise ImportError("Graphormer preprocessing needs Cython (pyximport)") + + self.spatial_pos_max = spatial_pos_max + self.on_the_fly_processing = on_the_fly_processing + + def __call__(self, features: list[dict]) -> dict[str, Any]: + if self.on_the_fly_processing: + features = [preprocess_item(i) for i in features] + + if not isinstance(features[0], Mapping): + features = [vars(f) for f in features] + batch = {} + + max_node_num = max(len(i["input_nodes"]) for i in features) + node_feat_size = len(features[0]["input_nodes"][0]) + edge_feat_size = len(features[0]["attn_edge_type"][0][0]) + max_dist = max(len(i["input_edges"][0][0]) for i in features) + edge_input_size = len(features[0]["input_edges"][0][0][0]) + batch_size = len(features) + + batch["attn_bias"] = torch.zeros(batch_size, max_node_num + 1, max_node_num + 1, dtype=torch.float) + batch["attn_edge_type"] = torch.zeros(batch_size, max_node_num, max_node_num, edge_feat_size, dtype=torch.long) + batch["spatial_pos"] = torch.zeros(batch_size, max_node_num, max_node_num, dtype=torch.long) + batch["in_degree"] = torch.zeros(batch_size, max_node_num, dtype=torch.long) + batch["input_nodes"] = torch.zeros(batch_size, max_node_num, node_feat_size, dtype=torch.long) + batch["input_edges"] = torch.zeros( + batch_size, max_node_num, max_node_num, max_dist, edge_input_size, dtype=torch.long + ) + + for ix, f in enumerate(features): + for k in ["attn_bias", "attn_edge_type", "spatial_pos", "in_degree", "input_nodes", "input_edges"]: + f[k] = torch.tensor(f[k]) + + if len(f["attn_bias"][1:, 1:][f["spatial_pos"] >= self.spatial_pos_max]) > 0: + f["attn_bias"][1:, 1:][f["spatial_pos"] >= self.spatial_pos_max] = float("-inf") + + batch["attn_bias"][ix, : f["attn_bias"].shape[0], : f["attn_bias"].shape[1]] = f["attn_bias"] + batch["attn_edge_type"][ix, : f["attn_edge_type"].shape[0], : f["attn_edge_type"].shape[1], :] = f[ + "attn_edge_type" + ] + batch["spatial_pos"][ix, : f["spatial_pos"].shape[0], : f["spatial_pos"].shape[1]] = f["spatial_pos"] + batch["in_degree"][ix, : f["in_degree"].shape[0]] = f["in_degree"] + batch["input_nodes"][ix, : f["input_nodes"].shape[0], :] = f["input_nodes"] + batch["input_edges"][ + ix, : f["input_edges"].shape[0], : f["input_edges"].shape[1], : f["input_edges"].shape[2], : + ] = f["input_edges"] + + batch["out_degree"] = batch["in_degree"] + + sample = features[0]["labels"] + if len(sample) == 1: # one task + if isinstance(sample[0], float): # regression + batch["labels"] = torch.from_numpy(np.concatenate([i["labels"] for i in features])) + else: # binary classification + batch["labels"] = torch.from_numpy(np.concatenate([i["labels"] for i in features])) + else: # multi task classification, left to float to keep the NaNs + batch["labels"] = torch.from_numpy(np.stack([i["labels"] for i in features], axis=0)) + + return batch diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/configuration_graphormer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/configuration_graphormer.py new file mode 100644 index 0000000000000000000000000000000000000000..81c13c6a802b3128d4f88b23f59ac31e8484e901 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/configuration_graphormer.py @@ -0,0 +1,220 @@ +# coding=utf-8 +# Copyright 2022 Microsoft, clefourrier and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Graphormer model configuration""" + +from typing import Optional + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class GraphormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`~GraphormerModel`]. It is used to instantiate an + Graphormer model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Graphormer + [graphormer-base-pcqm4mv1](https://huggingface.co/graphormer-base-pcqm4mv1) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + num_classes (`int`, *optional*, defaults to 1): + Number of target classes or labels, set to n for binary classification of n tasks. + num_atoms (`int`, *optional*, defaults to 512*9): + Number of node types in the graphs. + num_edges (`int`, *optional*, defaults to 512*3): + Number of edges types in the graph. + num_in_degree (`int`, *optional*, defaults to 512): + Number of in degrees types in the input graphs. + num_out_degree (`int`, *optional*, defaults to 512): + Number of out degrees types in the input graphs. + num_edge_dis (`int`, *optional*, defaults to 128): + Number of edge dis in the input graphs. + multi_hop_max_dist (`int`, *optional*, defaults to 20): + Maximum distance of multi hop edges between two nodes. + spatial_pos_max (`int`, *optional*, defaults to 1024): + Maximum distance between nodes in the graph attention bias matrices, used during preprocessing and + collation. + edge_type (`str`, *optional*, defaults to multihop): + Type of edge relation chosen. + max_nodes (`int`, *optional*, defaults to 512): + Maximum number of nodes which can be parsed for the input graphs. + share_input_output_embed (`bool`, *optional*, defaults to `False`): + Shares the embedding layer between encoder and decoder - careful, True is not implemented. + num_layers (`int`, *optional*, defaults to 12): + Number of layers. + embedding_dim (`int`, *optional*, defaults to 768): + Dimension of the embedding layer in encoder. + ffn_embedding_dim (`int`, *optional*, defaults to 768): + Dimension of the "intermediate" (often named feed-forward) layer in encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads in the encoder. + self_attention (`bool`, *optional*, defaults to `True`): + Model is self attentive (False not implemented). + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention weights. + activation_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the activation of the linear transformer layer. + layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + bias (`bool`, *optional*, defaults to `True`): + Uses bias in the attention module - unsupported at the moment. + embed_scale(`float`, *optional*, defaults to None): + Scaling factor for the node embeddings. + num_trans_layers_to_freeze (`int`, *optional*, defaults to 0): + Number of transformer layers to freeze. + encoder_normalize_before (`bool`, *optional*, defaults to `False`): + Normalize features before encoding the graph. + pre_layernorm (`bool`, *optional*, defaults to `False`): + Apply layernorm before self attention and the feed forward network. Without this, post layernorm will be + used. + apply_graphormer_init (`bool`, *optional*, defaults to `False`): + Apply a custom graphormer initialisation to the model before training. + freeze_embeddings (`bool`, *optional*, defaults to `False`): + Freeze the embedding layer, or train it along the model. + encoder_normalize_before (`bool`, *optional*, defaults to `False`): + Apply the layer norm before each encoder block. + q_noise (`float`, *optional*, defaults to 0.0): + Amount of quantization noise (see "Training with Quantization Noise for Extreme Model Compression"). (For + more detail, see fairseq's documentation on quant_noise). + qn_block_size (`int`, *optional*, defaults to 8): + Size of the blocks for subsequent quantization with iPQ (see q_noise). + kdim (`int`, *optional*, defaults to None): + Dimension of the key in the attention, if different from the other values. + vdim (`int`, *optional*, defaults to None): + Dimension of the value in the attention, if different from the other values. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + traceable (`bool`, *optional*, defaults to `False`): + Changes return value of the encoder's inner_state to stacked tensors. + + Example: + ```python + >>> from transformers import GraphormerForGraphClassification, GraphormerConfig + + >>> # Initializing a Graphormer graphormer-base-pcqm4mv2 style configuration + >>> configuration = GraphormerConfig() + + >>> # Initializing a model from the graphormer-base-pcqm4mv1 style configuration + >>> model = GraphormerForGraphClassification(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "graphormer" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + num_classes: int = 1, + num_atoms: int = 512 * 9, + num_edges: int = 512 * 3, + num_in_degree: int = 512, + num_out_degree: int = 512, + num_spatial: int = 512, + num_edge_dis: int = 128, + multi_hop_max_dist: int = 5, # sometimes is 20 + spatial_pos_max: int = 1024, + edge_type: str = "multi_hop", + max_nodes: int = 512, + share_input_output_embed: bool = False, + num_hidden_layers: int = 12, + embedding_dim: int = 768, + ffn_embedding_dim: int = 768, + num_attention_heads: int = 32, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + layerdrop: float = 0.0, + encoder_normalize_before: bool = False, + pre_layernorm: bool = False, + apply_graphormer_init: bool = False, + activation_fn: str = "gelu", + embed_scale: Optional[float] = None, + freeze_embeddings: bool = False, + num_trans_layers_to_freeze: int = 0, + traceable: bool = False, + q_noise: float = 0.0, + qn_block_size: int = 8, + kdim: Optional[int] = None, + vdim: Optional[int] = None, + bias: bool = True, + self_attention: bool = True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + **kwargs, + ): + self.num_classes = num_classes + self.num_atoms = num_atoms + self.num_in_degree = num_in_degree + self.num_out_degree = num_out_degree + self.num_edges = num_edges + self.num_spatial = num_spatial + self.num_edge_dis = num_edge_dis + self.edge_type = edge_type + self.multi_hop_max_dist = multi_hop_max_dist + self.spatial_pos_max = spatial_pos_max + self.max_nodes = max_nodes + self.num_hidden_layers = num_hidden_layers + self.embedding_dim = embedding_dim + self.hidden_size = embedding_dim + self.ffn_embedding_dim = ffn_embedding_dim + self.num_attention_heads = num_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.layerdrop = layerdrop + self.encoder_normalize_before = encoder_normalize_before + self.pre_layernorm = pre_layernorm + self.apply_graphormer_init = apply_graphormer_init + self.activation_fn = activation_fn + self.embed_scale = embed_scale + self.freeze_embeddings = freeze_embeddings + self.num_trans_layers_to_freeze = num_trans_layers_to_freeze + self.share_input_output_embed = share_input_output_embed + self.traceable = traceable + self.q_noise = q_noise + self.qn_block_size = qn_block_size + + # These parameters are here for future extensions + # atm, the model only supports self attention + self.kdim = kdim + self.vdim = vdim + self.self_attention = self_attention + self.bias = bias + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + +__all__ = ["GraphormerConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/modeling_graphormer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/modeling_graphormer.py new file mode 100644 index 0000000000000000000000000000000000000000..b3e8ea742c8dabd4bd13e34ea84fa60547fc0645 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/modeling_graphormer.py @@ -0,0 +1,912 @@ +# coding=utf-8 +# Copyright 2022 Microsoft, clefourrier The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Graphormer model.""" + +import math +from collections.abc import Iterable, Iterator +from typing import Optional, Union + +import torch +import torch.nn as nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ....activations import ACT2FN +from ....modeling_outputs import ( + BaseModelOutputWithNoAttention, + SequenceClassifierOutput, +) +from ....modeling_utils import PreTrainedModel +from ....utils import logging +from .configuration_graphormer import GraphormerConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "graphormer-base-pcqm4mv1" +_CONFIG_FOR_DOC = "GraphormerConfig" + + +def quant_noise(module: nn.Module, p: float, block_size: int): + """ + From: + https://github.com/facebookresearch/fairseq/blob/dd0079bde7f678b0cd0715cbd0ae68d661b7226d/fairseq/modules/quant_noise.py + + Wraps modules and applies quantization noise to the weights for subsequent quantization with Iterative Product + Quantization as described in "Training with Quantization Noise for Extreme Model Compression" + + Args: + - module: nn.Module + - p: amount of Quantization Noise + - block_size: size of the blocks for subsequent quantization with iPQ + + Remarks: + - Module weights must have the right sizes wrt the block size + - Only Linear, Embedding and Conv2d modules are supported for the moment + - For more detail on how to quantize by blocks with convolutional weights, see "And the Bit Goes Down: + Revisiting the Quantization of Neural Networks" + - We implement the simplest form of noise here as stated in the paper which consists in randomly dropping + blocks + """ + + # if no quantization noise, don't register hook + if p <= 0: + return module + + # supported modules + if not isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)): + raise NotImplementedError("Module unsupported for quant_noise.") + + # test whether module.weight has the right sizes wrt block_size + is_conv = module.weight.ndim == 4 + + # 2D matrix + if not is_conv: + if module.weight.size(1) % block_size != 0: + raise AssertionError("Input features must be a multiple of block sizes") + + # 4D matrix + else: + # 1x1 convolutions + if module.kernel_size == (1, 1): + if module.in_channels % block_size != 0: + raise AssertionError("Input channels must be a multiple of block sizes") + # regular convolutions + else: + k = module.kernel_size[0] * module.kernel_size[1] + if k % block_size != 0: + raise AssertionError("Kernel size must be a multiple of block size") + + def _forward_pre_hook(mod, input): + # no noise for evaluation + if mod.training: + if not is_conv: + # gather weight and sizes + weight = mod.weight + in_features = weight.size(1) + out_features = weight.size(0) + + # split weight matrix into blocks and randomly drop selected blocks + mask = torch.zeros(in_features // block_size * out_features, device=weight.device) + mask.bernoulli_(p) + mask = mask.repeat_interleave(block_size, -1).view(-1, in_features) + + else: + # gather weight and sizes + weight = mod.weight + in_channels = mod.in_channels + out_channels = mod.out_channels + + # split weight matrix into blocks and randomly drop selected blocks + if mod.kernel_size == (1, 1): + mask = torch.zeros( + int(in_channels // block_size * out_channels), + device=weight.device, + ) + mask.bernoulli_(p) + mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels) + else: + mask = torch.zeros(weight.size(0), weight.size(1), device=weight.device) + mask.bernoulli_(p) + mask = mask.unsqueeze(2).unsqueeze(3).repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1]) + + # scale weights and apply mask + mask = mask.to(torch.bool) # x.bool() is not currently supported in TorchScript + s = 1 / (1 - p) + mod.weight.data = s * weight.masked_fill(mask, 0) + + module.register_forward_pre_hook(_forward_pre_hook) + return module + + +class LayerDropModuleList(nn.ModuleList): + """ + From: + https://github.com/facebookresearch/fairseq/blob/dd0079bde7f678b0cd0715cbd0ae68d661b7226d/fairseq/modules/layer_drop.py + A LayerDrop implementation based on [`torch.nn.ModuleList`]. LayerDrop as described in + https://huggingface.co/papers/1909.11556. + + We refresh the choice of which layers to drop every time we iterate over the LayerDropModuleList instance. During + evaluation we always iterate over all layers. + + Usage: + + ```python + layers = LayerDropList(p=0.5, modules=[layer1, layer2, layer3]) + for layer in layers: # this might iterate over layers 1 and 3 + x = layer(x) + for layer in layers: # this might iterate over all layers + x = layer(x) + for layer in layers: # this might not iterate over any layers + x = layer(x) + ``` + + Args: + p (float): probability of dropping out each layer + modules (iterable, optional): an iterable of modules to add + """ + + def __init__(self, p: float, modules: Optional[Iterable[nn.Module]] = None): + super().__init__(modules) + self.p = p + + def __iter__(self) -> Iterator[nn.Module]: + dropout_probs = torch.empty(len(self)).uniform_() + for i, m in enumerate(super().__iter__()): + if not self.training or (dropout_probs[i] > self.p): + yield m + + +class GraphormerGraphNodeFeature(nn.Module): + """ + Compute node features for each node in the graph. + """ + + def __init__(self, config: GraphormerConfig): + super().__init__() + self.num_heads = config.num_attention_heads + self.num_atoms = config.num_atoms + + self.atom_encoder = nn.Embedding(config.num_atoms + 1, config.hidden_size, padding_idx=config.pad_token_id) + self.in_degree_encoder = nn.Embedding( + config.num_in_degree, config.hidden_size, padding_idx=config.pad_token_id + ) + self.out_degree_encoder = nn.Embedding( + config.num_out_degree, config.hidden_size, padding_idx=config.pad_token_id + ) + + self.graph_token = nn.Embedding(1, config.hidden_size) + + def forward( + self, + input_nodes: torch.LongTensor, + in_degree: torch.LongTensor, + out_degree: torch.LongTensor, + ) -> torch.Tensor: + n_graph, n_node = input_nodes.size()[:2] + + node_feature = ( # node feature + graph token + self.atom_encoder(input_nodes).sum(dim=-2) # [n_graph, n_node, n_hidden] + + self.in_degree_encoder(in_degree) + + self.out_degree_encoder(out_degree) + ) + + graph_token_feature = self.graph_token.weight.unsqueeze(0).repeat(n_graph, 1, 1) + + graph_node_feature = torch.cat([graph_token_feature, node_feature], dim=1) + + return graph_node_feature + + +class GraphormerGraphAttnBias(nn.Module): + """ + Compute attention bias for each head. + """ + + def __init__(self, config: GraphormerConfig): + super().__init__() + self.num_heads = config.num_attention_heads + self.multi_hop_max_dist = config.multi_hop_max_dist + + # We do not change edge feature embedding learning, as edge embeddings are represented as a combination of the original features + # + shortest path + self.edge_encoder = nn.Embedding(config.num_edges + 1, config.num_attention_heads, padding_idx=0) + + self.edge_type = config.edge_type + if self.edge_type == "multi_hop": + self.edge_dis_encoder = nn.Embedding( + config.num_edge_dis * config.num_attention_heads * config.num_attention_heads, + 1, + ) + + self.spatial_pos_encoder = nn.Embedding(config.num_spatial, config.num_attention_heads, padding_idx=0) + + self.graph_token_virtual_distance = nn.Embedding(1, config.num_attention_heads) + + def forward( + self, + input_nodes: torch.LongTensor, + attn_bias: torch.Tensor, + spatial_pos: torch.LongTensor, + input_edges: torch.LongTensor, + attn_edge_type: torch.LongTensor, + ) -> torch.Tensor: + n_graph, n_node = input_nodes.size()[:2] + graph_attn_bias = attn_bias.clone() + graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat( + 1, self.num_heads, 1, 1 + ) # [n_graph, n_head, n_node+1, n_node+1] + + # spatial pos + # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2) + graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + spatial_pos_bias + + # reset spatial pos here + t = self.graph_token_virtual_distance.weight.view(1, self.num_heads, 1) + graph_attn_bias[:, :, 1:, 0] = graph_attn_bias[:, :, 1:, 0] + t + graph_attn_bias[:, :, 0, :] = graph_attn_bias[:, :, 0, :] + t + + # edge feature + if self.edge_type == "multi_hop": + spatial_pos_ = spatial_pos.clone() + + spatial_pos_[spatial_pos_ == 0] = 1 # set pad to 1 + # set 1 to 1, input_nodes > 1 to input_nodes - 1 + spatial_pos_ = torch.where(spatial_pos_ > 1, spatial_pos_ - 1, spatial_pos_) + if self.multi_hop_max_dist > 0: + spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist) + input_edges = input_edges[:, :, :, : self.multi_hop_max_dist, :] + # [n_graph, n_node, n_node, max_dist, n_head] + + input_edges = self.edge_encoder(input_edges).mean(-2) + max_dist = input_edges.size(-2) + edge_input_flat = input_edges.permute(3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads) + edge_input_flat = torch.bmm( + edge_input_flat, + self.edge_dis_encoder.weight.reshape(-1, self.num_heads, self.num_heads)[:max_dist, :, :], + ) + input_edges = edge_input_flat.reshape(max_dist, n_graph, n_node, n_node, self.num_heads).permute( + 1, 2, 3, 0, 4 + ) + input_edges = (input_edges.sum(-2) / (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 1, 2) + else: + # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node] + input_edges = self.edge_encoder(attn_edge_type).mean(-2).permute(0, 3, 1, 2) + + graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + input_edges + graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset + + return graph_attn_bias + + +class GraphormerMultiheadAttention(nn.Module): + """Multi-headed attention. + + See "Attention Is All You Need" for more details. + """ + + def __init__(self, config: GraphormerConfig): + super().__init__() + self.embedding_dim = config.embedding_dim + self.kdim = config.kdim if config.kdim is not None else config.embedding_dim + self.vdim = config.vdim if config.vdim is not None else config.embedding_dim + self.qkv_same_dim = self.kdim == config.embedding_dim and self.vdim == config.embedding_dim + + self.num_heads = config.num_attention_heads + self.attention_dropout_module = torch.nn.Dropout(p=config.attention_dropout, inplace=False) + + self.head_dim = config.embedding_dim // config.num_attention_heads + if not (self.head_dim * config.num_attention_heads == self.embedding_dim): + raise AssertionError("The embedding_dim must be divisible by num_heads.") + self.scaling = self.head_dim**-0.5 + + self.self_attention = True # config.self_attention + if not (self.self_attention): + raise NotImplementedError("The Graphormer model only supports self attention for now.") + if self.self_attention and not self.qkv_same_dim: + raise AssertionError("Self-attention requires query, key and value to be of the same size.") + + self.k_proj = quant_noise( + nn.Linear(self.kdim, config.embedding_dim, bias=config.bias), + config.q_noise, + config.qn_block_size, + ) + self.v_proj = quant_noise( + nn.Linear(self.vdim, config.embedding_dim, bias=config.bias), + config.q_noise, + config.qn_block_size, + ) + self.q_proj = quant_noise( + nn.Linear(config.embedding_dim, config.embedding_dim, bias=config.bias), + config.q_noise, + config.qn_block_size, + ) + + self.out_proj = quant_noise( + nn.Linear(config.embedding_dim, config.embedding_dim, bias=config.bias), + config.q_noise, + config.qn_block_size, + ) + + self.onnx_trace = False + + def reset_parameters(self): + if self.qkv_same_dim: + # Empirically observed the convergence to be much better with + # the scaled initialization + nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) + nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) + nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) + else: + nn.init.xavier_uniform_(self.k_proj.weight) + nn.init.xavier_uniform_(self.v_proj.weight) + nn.init.xavier_uniform_(self.q_proj.weight) + + nn.init.xavier_uniform_(self.out_proj.weight) + if self.out_proj.bias is not None: + nn.init.constant_(self.out_proj.bias, 0.0) + + def forward( + self, + query: torch.LongTensor, + key: Optional[torch.Tensor], + value: Optional[torch.Tensor], + attn_bias: Optional[torch.Tensor], + key_padding_mask: Optional[torch.Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[torch.Tensor] = None, + before_softmax: bool = False, + need_head_weights: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Args: + key_padding_mask (Bytetorch.Tensor, optional): mask to exclude + keys that are pads, of shape `(batch, src_len)`, where padding elements are indicated by 1s. + need_weights (bool, optional): return the attention weights, + averaged over heads (default: False). + attn_mask (Bytetorch.Tensor, optional): typically used to + implement causal attention, where the mask prevents the attention from looking forward in time + (default: None). + before_softmax (bool, optional): return the raw attention + weights and values before the attention softmax. + need_head_weights (bool, optional): return the attention + weights for each head. Implies *need_weights*. Default: return the average attention weights over all + heads. + """ + if need_head_weights: + need_weights = True + + tgt_len, bsz, embedding_dim = query.size() + src_len = tgt_len + if not (embedding_dim == self.embedding_dim): + raise AssertionError( + f"The query embedding dimension {embedding_dim} is not equal to the expected embedding_dim" + f" {self.embedding_dim}." + ) + if not (list(query.size()) == [tgt_len, bsz, embedding_dim]): + raise AssertionError("Query size incorrect in Graphormer, compared to model dimensions.") + + if key is not None: + src_len, key_bsz, _ = key.size() + if not torch.jit.is_scripting(): + if (key_bsz != bsz) or (value is None) or not (src_len, bsz == value.shape[:2]): + raise AssertionError( + "The batch shape does not match the key or value shapes provided to the attention." + ) + + q = self.q_proj(query) + k = self.k_proj(query) + v = self.v_proj(query) + + q *= self.scaling + + q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1) + if k is not None: + k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) + if v is not None: + v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) + + if (k is None) or not (k.size(1) == src_len): + raise AssertionError("The shape of the key generated in the attention is incorrect") + + # This is part of a workaround to get around fork/join parallelism + # not supporting Optional types. + if key_padding_mask is not None and key_padding_mask.dim() == 0: + key_padding_mask = None + + if key_padding_mask is not None: + if key_padding_mask.size(0) != bsz or key_padding_mask.size(1) != src_len: + raise AssertionError( + "The shape of the generated padding mask for the key does not match expected dimensions." + ) + attn_weights = torch.bmm(q, k.transpose(1, 2)) + attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) + + if list(attn_weights.size()) != [bsz * self.num_heads, tgt_len, src_len]: + raise AssertionError("The attention weights generated do not match the expected dimensions.") + + if attn_bias is not None: + attn_weights += attn_bias.view(bsz * self.num_heads, tgt_len, src_len) + + if attn_mask is not None: + attn_mask = attn_mask.unsqueeze(0) + attn_weights += attn_mask + + if key_padding_mask is not None: + # don't attend to padding symbols + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf") + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if before_softmax: + return attn_weights, v + + attn_weights_float = torch.nn.functional.softmax(attn_weights, dim=-1) + attn_weights = attn_weights_float.type_as(attn_weights) + attn_probs = self.attention_dropout_module(attn_weights) + + if v is None: + raise AssertionError("No value generated") + attn = torch.bmm(attn_probs, v) + if list(attn.size()) != [bsz * self.num_heads, tgt_len, self.head_dim]: + raise AssertionError("The attention generated do not match the expected dimensions.") + + attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embedding_dim) + attn: torch.Tensor = self.out_proj(attn) + + attn_weights = None + if need_weights: + attn_weights = attn_weights_float.contiguous().view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0) + if not need_head_weights: + # average attention weights over heads + attn_weights = attn_weights.mean(dim=0) + + return attn, attn_weights + + def apply_sparse_mask(self, attn_weights: torch.Tensor, tgt_len: int, src_len: int, bsz: int) -> torch.Tensor: + return attn_weights + + +class GraphormerGraphEncoderLayer(nn.Module): + def __init__(self, config: GraphormerConfig) -> None: + super().__init__() + + # Initialize parameters + self.embedding_dim = config.embedding_dim + self.num_attention_heads = config.num_attention_heads + self.q_noise = config.q_noise + self.qn_block_size = config.qn_block_size + self.pre_layernorm = config.pre_layernorm + + self.dropout_module = torch.nn.Dropout(p=config.dropout, inplace=False) + + self.activation_dropout_module = torch.nn.Dropout(p=config.activation_dropout, inplace=False) + + # Initialize blocks + self.activation_fn = ACT2FN[config.activation_fn] + self.self_attn = GraphormerMultiheadAttention(config) + + # layer norm associated with the self attention layer + self.self_attn_layer_norm = nn.LayerNorm(self.embedding_dim) + + self.fc1 = self.build_fc( + self.embedding_dim, + config.ffn_embedding_dim, + q_noise=config.q_noise, + qn_block_size=config.qn_block_size, + ) + self.fc2 = self.build_fc( + config.ffn_embedding_dim, + self.embedding_dim, + q_noise=config.q_noise, + qn_block_size=config.qn_block_size, + ) + + # layer norm associated with the position wise feed-forward NN + self.final_layer_norm = nn.LayerNorm(self.embedding_dim) + + def build_fc( + self, input_dim: int, output_dim: int, q_noise: float, qn_block_size: int + ) -> Union[nn.Module, nn.Linear, nn.Embedding, nn.Conv2d]: + return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) + + def forward( + self, + input_nodes: torch.Tensor, + self_attn_bias: Optional[torch.Tensor] = None, + self_attn_mask: Optional[torch.Tensor] = None, + self_attn_padding_mask: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + nn.LayerNorm is applied either before or after the self-attention/ffn modules similar to the original + Transformer implementation. + """ + residual = input_nodes + if self.pre_layernorm: + input_nodes = self.self_attn_layer_norm(input_nodes) + + input_nodes, attn = self.self_attn( + query=input_nodes, + key=input_nodes, + value=input_nodes, + attn_bias=self_attn_bias, + key_padding_mask=self_attn_padding_mask, + need_weights=False, + attn_mask=self_attn_mask, + ) + input_nodes = self.dropout_module(input_nodes) + input_nodes = residual + input_nodes + if not self.pre_layernorm: + input_nodes = self.self_attn_layer_norm(input_nodes) + + residual = input_nodes + if self.pre_layernorm: + input_nodes = self.final_layer_norm(input_nodes) + input_nodes = self.activation_fn(self.fc1(input_nodes)) + input_nodes = self.activation_dropout_module(input_nodes) + input_nodes = self.fc2(input_nodes) + input_nodes = self.dropout_module(input_nodes) + input_nodes = residual + input_nodes + if not self.pre_layernorm: + input_nodes = self.final_layer_norm(input_nodes) + + return input_nodes, attn + + +class GraphormerGraphEncoder(nn.Module): + def __init__(self, config: GraphormerConfig): + super().__init__() + + self.dropout_module = torch.nn.Dropout(p=config.dropout, inplace=False) + self.layerdrop = config.layerdrop + self.embedding_dim = config.embedding_dim + self.apply_graphormer_init = config.apply_graphormer_init + self.traceable = config.traceable + + self.graph_node_feature = GraphormerGraphNodeFeature(config) + self.graph_attn_bias = GraphormerGraphAttnBias(config) + + self.embed_scale = config.embed_scale + + if config.q_noise > 0: + self.quant_noise = quant_noise( + nn.Linear(self.embedding_dim, self.embedding_dim, bias=False), + config.q_noise, + config.qn_block_size, + ) + else: + self.quant_noise = None + + if config.encoder_normalize_before: + self.emb_layer_norm = nn.LayerNorm(self.embedding_dim) + else: + self.emb_layer_norm = None + + if config.pre_layernorm: + self.final_layer_norm = nn.LayerNorm(self.embedding_dim) + + if self.layerdrop > 0.0: + self.layers = LayerDropModuleList(p=self.layerdrop) + else: + self.layers = nn.ModuleList([]) + self.layers.extend([GraphormerGraphEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + + # Apply initialization of model params after building the model + if config.freeze_embeddings: + raise NotImplementedError("Freezing embeddings is not implemented yet.") + + for layer in range(config.num_trans_layers_to_freeze): + m = self.layers[layer] + if m is not None: + for p in m.parameters(): + p.requires_grad = False + + def forward( + self, + input_nodes: torch.LongTensor, + input_edges: torch.LongTensor, + attn_bias: torch.Tensor, + in_degree: torch.LongTensor, + out_degree: torch.LongTensor, + spatial_pos: torch.LongTensor, + attn_edge_type: torch.LongTensor, + perturb=None, + last_state_only: bool = False, + token_embeddings: Optional[torch.Tensor] = None, + attn_mask: Optional[torch.Tensor] = None, + ) -> tuple[Union[torch.Tensor, list[torch.LongTensor]], torch.Tensor]: + # compute padding mask. This is needed for multi-head attention + data_x = input_nodes + n_graph, n_node = data_x.size()[:2] + padding_mask = (data_x[:, :, 0]).eq(0) + padding_mask_cls = torch.zeros(n_graph, 1, device=padding_mask.device, dtype=padding_mask.dtype) + padding_mask = torch.cat((padding_mask_cls, padding_mask), dim=1) + + attn_bias = self.graph_attn_bias(input_nodes, attn_bias, spatial_pos, input_edges, attn_edge_type) + + if token_embeddings is not None: + input_nodes = token_embeddings + else: + input_nodes = self.graph_node_feature(input_nodes, in_degree, out_degree) + + if perturb is not None: + input_nodes[:, 1:, :] += perturb + + if self.embed_scale is not None: + input_nodes = input_nodes * self.embed_scale + + if self.quant_noise is not None: + input_nodes = self.quant_noise(input_nodes) + + if self.emb_layer_norm is not None: + input_nodes = self.emb_layer_norm(input_nodes) + + input_nodes = self.dropout_module(input_nodes) + + input_nodes = input_nodes.transpose(0, 1) + + inner_states = [] + if not last_state_only: + inner_states.append(input_nodes) + + for layer in self.layers: + input_nodes, _ = layer( + input_nodes, + self_attn_padding_mask=padding_mask, + self_attn_mask=attn_mask, + self_attn_bias=attn_bias, + ) + if not last_state_only: + inner_states.append(input_nodes) + + graph_rep = input_nodes[0, :, :] + + if last_state_only: + inner_states = [input_nodes] + + if self.traceable: + return torch.stack(inner_states), graph_rep + else: + return inner_states, graph_rep + + +class GraphormerDecoderHead(nn.Module): + def __init__(self, embedding_dim: int, num_classes: int): + super().__init__() + """num_classes should be 1 for regression, or the number of classes for classification""" + self.lm_output_learned_bias = nn.Parameter(torch.zeros(1)) + self.classifier = nn.Linear(embedding_dim, num_classes, bias=False) + self.num_classes = num_classes + + def forward(self, input_nodes: torch.Tensor, **unused) -> torch.Tensor: + input_nodes = self.classifier(input_nodes) + input_nodes = input_nodes + self.lm_output_learned_bias + return input_nodes + + +class GraphormerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: GraphormerConfig + base_model_prefix = "graphormer" + main_input_name_nodes = "input_nodes" + main_input_name_edges = "input_edges" + + def normal_(self, data: torch.Tensor): + # with FSDP, module params will be on CUDA, so we cast them back to CPU + # so that the RNG is consistent with and without FSDP + data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device)) + + def init_graphormer_params(self, module: Union[nn.Linear, nn.Embedding, GraphormerMultiheadAttention]): + """ + Initialize the weights specific to the Graphormer Model. + """ + if isinstance(module, nn.Linear): + self.normal_(module.weight.data) + if module.bias is not None: + module.bias.data.zero_() + if isinstance(module, nn.Embedding): + self.normal_(module.weight.data) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if isinstance(module, GraphormerMultiheadAttention): + self.normal_(module.q_proj.weight.data) + self.normal_(module.k_proj.weight.data) + self.normal_(module.v_proj.weight.data) + + def _init_weights( + self, + module: Union[ + nn.Linear, nn.Conv2d, nn.Embedding, nn.LayerNorm, GraphormerMultiheadAttention, GraphormerGraphEncoder + ], + ): + """ + Initialize the weights + """ + if isinstance(module, (nn.Linear, nn.Conv2d)): + # We might be missing part of the Linear init, dependent on the layer num + module.weight.data.normal_(mean=0.0, std=0.02) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=0.02) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, GraphormerMultiheadAttention): + module.q_proj.weight.data.normal_(mean=0.0, std=0.02) + module.k_proj.weight.data.normal_(mean=0.0, std=0.02) + module.v_proj.weight.data.normal_(mean=0.0, std=0.02) + module.reset_parameters() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, GraphormerGraphEncoder): + if module.apply_graphormer_init: + module.apply(self.init_graphormer_params) + + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +class GraphormerModel(GraphormerPreTrainedModel): + """The Graphormer model is a graph-encoder model. + + It goes from a graph to its representation. If you want to use the model for a downstream classification task, use + GraphormerForGraphClassification instead. For any other downstream task, feel free to add a new class, or combine + this model with a downstream model of your choice, following the example in GraphormerForGraphClassification. + """ + + def __init__(self, config: GraphormerConfig): + super().__init__(config) + self.max_nodes = config.max_nodes + + self.graph_encoder = GraphormerGraphEncoder(config) + + self.share_input_output_embed = config.share_input_output_embed + self.lm_output_learned_bias = None + + # Remove head is set to true during fine-tuning + self.load_softmax = not getattr(config, "remove_head", False) + + self.lm_head_transform_weight = nn.Linear(config.embedding_dim, config.embedding_dim) + self.activation_fn = ACT2FN[config.activation_fn] + self.layer_norm = nn.LayerNorm(config.embedding_dim) + + self.post_init() + + def reset_output_layer_parameters(self): + self.lm_output_learned_bias = nn.Parameter(torch.zeros(1)) + + def forward( + self, + input_nodes: torch.LongTensor, + input_edges: torch.LongTensor, + attn_bias: torch.Tensor, + in_degree: torch.LongTensor, + out_degree: torch.LongTensor, + spatial_pos: torch.LongTensor, + attn_edge_type: torch.LongTensor, + perturb: Optional[torch.FloatTensor] = None, + masked_tokens: None = None, + return_dict: Optional[bool] = None, + **unused, + ) -> Union[tuple[torch.LongTensor], BaseModelOutputWithNoAttention]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + inner_states, graph_rep = self.graph_encoder( + input_nodes, input_edges, attn_bias, in_degree, out_degree, spatial_pos, attn_edge_type, perturb=perturb + ) + + # last inner state, then revert Batch and Graph len + input_nodes = inner_states[-1].transpose(0, 1) + + # project masked tokens only + if masked_tokens is not None: + raise NotImplementedError + + input_nodes = self.layer_norm(self.activation_fn(self.lm_head_transform_weight(input_nodes))) + + # project back to size of vocabulary + if self.share_input_output_embed and hasattr(self.graph_encoder.embed_tokens, "weight"): + input_nodes = torch.nn.functional.linear(input_nodes, self.graph_encoder.embed_tokens.weight) + + if not return_dict: + return tuple(x for x in [input_nodes, inner_states] if x is not None) + return BaseModelOutputWithNoAttention(last_hidden_state=input_nodes, hidden_states=inner_states) + + def max_nodes(self): + """Maximum output length supported by the encoder.""" + return self.max_nodes + + +class GraphormerForGraphClassification(GraphormerPreTrainedModel): + """ + This model can be used for graph-level classification or regression tasks. + + It can be trained on + - regression (by setting config.num_classes to 1); there should be one float-type label per graph + - one task classification (by setting config.num_classes to the number of classes); there should be one integer + label per graph + - binary multi-task classification (by setting config.num_classes to the number of labels); there should be a list + of integer labels for each graph. + """ + + def __init__(self, config: GraphormerConfig): + super().__init__(config) + self.encoder = GraphormerModel(config) + self.embedding_dim = config.embedding_dim + self.num_classes = config.num_classes + self.classifier = GraphormerDecoderHead(self.embedding_dim, self.num_classes) + self.is_encoder_decoder = True + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_nodes: torch.LongTensor, + input_edges: torch.LongTensor, + attn_bias: torch.Tensor, + in_degree: torch.LongTensor, + out_degree: torch.LongTensor, + spatial_pos: torch.LongTensor, + attn_edge_type: torch.LongTensor, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + **unused, + ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_outputs = self.encoder( + input_nodes, + input_edges, + attn_bias, + in_degree, + out_degree, + spatial_pos, + attn_edge_type, + return_dict=True, + ) + outputs, hidden_states = encoder_outputs["last_hidden_state"], encoder_outputs["hidden_states"] + + head_outputs = self.classifier(outputs) + logits = head_outputs[:, 0, :].contiguous() + + loss = None + if labels is not None: + mask = ~torch.isnan(labels) + + if self.num_classes == 1: # regression + loss_fct = MSELoss() + loss = loss_fct(logits[mask].squeeze(), labels[mask].squeeze().float()) + elif self.num_classes > 1 and len(labels.shape) == 1: # One task classification + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits[mask].view(-1, self.num_classes), labels[mask].view(-1)) + else: # Binary multi-task classification + loss_fct = BCEWithLogitsLoss(reduction="sum") + loss = loss_fct(logits[mask], labels[mask]) + + if not return_dict: + return tuple(x for x in [loss, logits, hidden_states] if x is not None) + return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=hidden_states, attentions=None) + + +__all__ = ["GraphormerForGraphClassification", "GraphormerModel", "GraphormerPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..826bdbddc1f182de43a795ab0d78ad4009507c14 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_jukebox import * + from .modeling_jukebox import * + from .tokenization_jukebox import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/configuration_jukebox.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/configuration_jukebox.py new file mode 100644 index 0000000000000000000000000000000000000000..9bb2e3da54aa232bf934a7922f731796a7146937 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/configuration_jukebox.py @@ -0,0 +1,613 @@ +# coding=utf-8 +# Copyright 2022 The OpenAI Team Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Jukebox configuration""" + +import os +from typing import Union + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +_LARGE_ATTENTION = [ + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "cross_attention", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "cross_attention", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "cross_attention", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "cross_attention", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "cross_attention", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "cross_attention", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "block_attn", + "transpose_block_attn", + "prev_block_attn", + "cross_attention", +] +_RawColumnPreviousRowAttention = ["block_attn", "transpose_block_attn", "prev_block_attn"] +_FullDenseAttention = ["dense_attention"] +_PrimePrimeDenseAttention = ["prime_attn", "prime_attn", "dense_attn"] + + +def full_dense_attention(layer): + return _FullDenseAttention[0] + + +def raw_column_previous_row_attention(layer): + return _RawColumnPreviousRowAttention[layer % 3] + + +def large_separated_enc_dec_w_lyrics(layer): + return _LARGE_ATTENTION[layer % 79] + + +def enc_dec_with_lyrics(layer): + if layer % 16 == 15: + return _PrimePrimeDenseAttention[layer % 3] + return _RawColumnPreviousRowAttention[layer % 3] + + +ATTENTION_PATTERNS = { + "full_dense_attention": full_dense_attention, + "raw_column_previous_row_attention": raw_column_previous_row_attention, # Alternate row, column and previous row attn + "large_separated_enc_dec_w_lyrics": large_separated_enc_dec_w_lyrics, # Used by large separated_enc_dec model with lyrics + "enc_dec_with_lyrics": enc_dec_with_lyrics, # Used by encoder_decoder model with lyrics +} + + +class JukeboxPriorConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`JukeboxPrior`]. It is used to instantiate a + `JukeboxPrior` according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the top level prior from the + [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox + -1b-lyrics) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + + Args: + act_fn (`str`, *optional*, defaults to `"quick_gelu"`): + Activation function. + alignment_head (`int`, *optional*, defaults to 2): + Head that is responsible of the alignment between lyrics and music. Only used to compute the lyric to audio + alignment + alignment_layer (`int`, *optional*, defaults to 68): + Index of the layer that is responsible of the alignment between lyrics and music. Only used to compute the + lyric to audio alignment + attention_multiplier (`float`, *optional*, defaults to 0.25): + Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that + 0.25*width of the model will be used. + attention_pattern (`str`, *optional*, defaults to `"enc_dec_with_lyrics"`): + Which attention pattern to use for the decoder/ + attn_dropout (`int`, *optional*, defaults to 0): + Dropout probability for the post-attention layer dropout in the decoder. + attn_res_scale (`bool`, *optional*, defaults to `False`): + Whether or not to scale the residuals in the attention conditioner block. + blocks (`int`, *optional*, defaults to 64): + Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as `[blocks, seq_len // + blocks]` in the `JukeboxAttention` layer. + conv_res_scale (`int`, *optional*): + Whether or not to scale the residuals in the conditioner block. Since the top level prior does not have a + conditioner, the default value is to None and should not be modified. + num_layers (`int`, *optional*, defaults to 72): + Number of layers of the transformer architecture. + emb_dropout (`int`, *optional*, defaults to 0): + Embedding dropout used in the lyric decoder. + encoder_config (`JukeboxPriorConfig`, *optional*) : + Configuration of the encoder which models the prior on the lyrics. + encoder_loss_fraction (`float`, *optional*, defaults to 0.4): + Multiplication factor used in front of the lyric encoder loss. + hidden_size (`int`, *optional*, defaults to 2048): + Hidden dimension of the attention layers. + init_scale (`float`, *optional*, defaults to 0.2): + Initialization scales for the prior modules. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether or not the prior is an encoder-decoder model. In case it is not, and `nb_relevant_lyric_tokens` is + greater than 0, the `encoder` args should be specified for the lyric encoding. + mask (`bool`, *optional*, defaults to `False`): + Whether or not to mask the previous positions in the attention. + max_duration (`int`, *optional*, defaults to 600): + Maximum supported duration of the generated song in seconds. + max_nb_genres (`int`, *optional*, defaults to 1): + Maximum number of genres that can be used to condition the model. + merged_decoder (`bool`, *optional*, defaults to `True`): + Whether or not the decoder and the encoder inputs are merged. This is used for the separated + encoder-decoder architecture + metadata_conditioning (`bool`, *optional*, defaults to `True)`: + Whether or not to condition on the artist and genre metadata. + metadata_dims (`List[int]`, *optional*, defaults to `[604, 7898]`): + Number of genres and the number of artists that were used to train the embedding layers of the prior + models. + min_duration (`int`, *optional*, defaults to 0): + Minimum duration of the generated audio on which the model was trained. + mlp_multiplier (`float`, *optional*, defaults to 1.0): + Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of + the model will be used. + music_vocab_size (`int`, *optional*, defaults to 2048): + Number of different music tokens. Should be similar to the `JukeboxVQVAEConfig.nb_discrete_codes`. + n_ctx (`int`, *optional*, defaults to 6144): + Number of context tokens for each prior. The context tokens are the music tokens that are attended to when + generating music tokens. + n_heads (`int`, *optional*, defaults to 2): + Number of attention heads. + nb_relevant_lyric_tokens (`int`, *optional*, defaults to 384): + Number of lyric tokens that are used when sampling a single window of length `n_ctx` + res_conv_depth (`int`, *optional*, defaults to 3): + Depth of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the + `JukeboxMusicTokenConditioner`. + res_conv_width (`int`, *optional*, defaults to 128): + Width of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the + `JukeboxMusicTokenConditioner`. + res_convolution_multiplier (`int`, *optional*, defaults to 1): + Multiplier used to scale the `hidden_dim` of the `JukeboxResConv1DBlock`. + res_dilation_cycle (`int`, *optional*): + Dilation cycle used to define the `JukeboxMusicTokenConditioner`. Usually similar to the ones used in the + corresponding level of the VQVAE. The first prior does not use it as it is not conditioned on upper level + tokens. + res_dilation_growth_rate (`int`, *optional*, defaults to 1): + Dilation grow rate used between each convolutionnal block of the `JukeboxMusicTokenConditioner` + res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`): + Downsampling rates used in the audio conditioning network + res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`): + Striding used in the audio conditioning network + resid_dropout (`int`, *optional*, defaults to 0): + Residual dropout used in the attention pattern. + sampling_rate (`int`, *optional*, defaults to 44100): + Sampling rate used for training. + spread (`int`, *optional*): + Spread used in the `summary_spread_attention` pattern + timing_dims (`int`, *optional*, defaults to 64): + Dimension of the timing embedding. + zero_out (`bool`, *optional*, defaults to `False`): + Whether or not to zero out convolution weights when initializing. + """ + + model_type = "jukebox_prior" + attribute_map = { + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + } + + def __init__( + self, + act_fn="quick_gelu", + level=0, + alignment_head=2, + alignment_layer=68, + attention_multiplier=0.25, + attention_pattern="enc_dec_with_lyrics", + attn_dropout=0, + attn_res_scale=False, + blocks=64, + conv_res_scale=None, + num_layers=72, + emb_dropout=0, + encoder_config=None, + encoder_loss_fraction=0.4, + hidden_size=2048, + init_scale=0.2, + is_encoder_decoder=True, + lyric_vocab_size=80, + mask=False, + max_duration=600, + max_nb_genres=1, + merged_decoder=True, + metadata_conditioning=True, + metadata_dims=[604, 7898], + min_duration=0, + mlp_multiplier=1.0, + music_vocab_size=2048, + n_ctx=6144, + n_heads=2, + nb_relevant_lyric_tokens=384, + res_conv_depth=3, + res_conv_width=128, + res_convolution_multiplier=1, + res_dilation_cycle=None, + res_dilation_growth_rate=1, + res_downs_t=[3, 2, 2], + res_strides_t=[2, 2, 2], + resid_dropout=0, + sampling_rate=44100, + spread=None, + timing_dims=64, + zero_out=False, + **kwargs, + ): + super().__init__(**kwargs) + self.act_fn = act_fn + self.alignment_head = alignment_head + self.alignment_layer = alignment_layer + self.attention_multiplier = attention_multiplier + self.attention_pattern = attention_pattern + self.attn_dropout = attn_dropout + self.attn_res_scale = attn_res_scale + self.blocks = blocks + self.conv_res_scale = conv_res_scale + self.num_layers = num_layers + self.emb_dropout = emb_dropout + self.music_vocab_size = music_vocab_size + if encoder_config is not None: + self.encoder_config = JukeboxPriorConfig(**encoder_config) + else: + self.encoder_config = None + self.encoder_loss_fraction = encoder_loss_fraction + self.init_scale = init_scale + self.is_encoder_decoder = is_encoder_decoder + self.lyric_vocab_size = lyric_vocab_size + self.level = level + self.mask = mask + self.max_duration = max_duration + self.max_nb_genres = max_nb_genres + self.merged_decoder = merged_decoder + self.metadata_conditioning = metadata_conditioning + self.metadata_dims = metadata_dims + self.min_duration = min_duration + self.mlp_multiplier = mlp_multiplier + self.n_ctx = n_ctx + self.n_heads = n_heads + self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens + self.res_conv_depth = res_conv_depth + self.res_conv_width = res_conv_width + self.res_convolution_multiplier = res_convolution_multiplier + self.res_dilation_cycle = res_dilation_cycle + self.res_dilation_growth_rate = res_dilation_growth_rate + self.res_downs_t = res_downs_t + self.res_strides_t = res_strides_t + self.resid_dropout = resid_dropout + self.sampling_rate = sampling_rate + self.spread = spread + self.timing_dims = timing_dims + self.hidden_size = hidden_size + self.zero_out = zero_out + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], level=0, **kwargs): + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the prior config dict if we are loading from JukeboxConfig + if config_dict.get("model_type") == "jukebox": + config_dict = config_dict[f"prior_{level}"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class JukeboxVQVAEConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`JukeboxVQVAE`]. It is used to instantiate a + `JukeboxVQVAE` according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the VQVAE from + [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + act_fn (`str`, *optional*, defaults to `"relu"`): + Activation function of the model. + nb_discrete_codes (`int`, *optional*, defaults to 2048): + Number of codes of the VQVAE. + commit (`float`, *optional*, defaults to 0.02): + Commit loss multiplier. + conv_input_shape (`int`, *optional*, defaults to 1): + Number of audio channels. + conv_res_scale (`bool`, *optional*, defaults to `False`): + Whether or not to scale the residuals of the `JukeboxResConv1DBlock`. + embed_dim (`int`, *optional*, defaults to 64): + Embedding dimension of the codebook vectors. + hop_fraction (`List[int]`, *optional*, defaults to `[0.125, 0.5, 0.5]`): + Fraction of non-intersecting window used when continuing the sampling process. + levels (`int`, *optional*, defaults to 3): + Number of hierarchical levels that used in the VQVAE. + lmu (`float`, *optional*, defaults to 0.99): + Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1 + of the original [VQVAE paper](https://huggingface.co/papers/1711.00937v2.pdf) + multipliers (`List[int]`, *optional*, defaults to `[2, 1, 1]`): + Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth` + res_conv_depth (`int`, *optional*, defaults to 4): + Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level. + res_conv_width (`int`, *optional*, defaults to 32): + Width of the encoder and decoder block. If no `multipliers` are used, this is the same for each level. + res_convolution_multiplier (`int`, *optional*, defaults to 1): + Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`. + res_dilation_cycle (`int`, *optional*): + Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth + reduced by a power of `res_dilation_cycle`. + res_dilation_growth_rate (`int`, *optional*, defaults to 3): + Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth) + res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`): + Downsampling rate for each level of the hierarchical VQ-VAE. + res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`): + Stride used for each level of the hierarchical VQ-VAE. + sample_length (`int`, *optional*, defaults to 1058304): + Provides the max input shape of the VQVAE. Is used to compute the input shape of each level. + init_scale (`float`, *optional*, defaults to 0.2): + Initialization scale. + zero_out (`bool`, *optional*, defaults to `False`): + Whether or not to zero out convolution weights when initializing. + """ + + model_type = "jukebox_vqvae" + + def __init__( + self, + act_fn="relu", + nb_discrete_codes=2048, + commit=0.02, + conv_input_shape=1, + conv_res_scale=False, + embed_dim=64, + hop_fraction=[0.125, 0.5, 0.5], + levels=3, + lmu=0.99, + multipliers=[2, 1, 1], + res_conv_depth=4, + res_conv_width=32, + res_convolution_multiplier=1, + res_dilation_cycle=None, + res_dilation_growth_rate=3, + res_downs_t=[3, 2, 2], + res_strides_t=[2, 2, 2], + sample_length=1058304, + init_scale=0.2, + zero_out=False, + **kwargs, + ): + super().__init__(**kwargs) + self.hop_fraction = hop_fraction + self.conv_input_shape = conv_input_shape + self.sample_length = sample_length + + # VQVAE parameters (all used) + self.levels = levels + self.embed_dim = embed_dim + self.nb_discrete_codes = nb_discrete_codes + self.res_conv_width = res_conv_width + self.res_conv_depth = res_conv_depth + self.res_convolution_multiplier = res_convolution_multiplier + self.res_dilation_growth_rate = res_dilation_growth_rate + self.res_dilation_cycle = res_dilation_cycle + self.multipliers = multipliers + self.res_downs_t = res_downs_t + self.res_strides_t = res_strides_t + self.lmu = lmu + self.commit = commit + self.conv_res_scale = conv_res_scale + self.act_fn = act_fn + self.init_scale = init_scale + self.zero_out = zero_out + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "jukebox": + config_dict = config_dict["vqvae_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class JukeboxConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`JukeboxModel`]. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will + yield a similar configuration to that of + [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture. + + + The downsampling and stride are used to determine downsampling of the input sequence. For example, downsampling = + (5,3), and strides = (2, 2) will downsample the audio by 2^5 = 32 to get the first level of codes, and 2**8 = 256 + to get the second level codes. This is mostly true for training the top level prior and the upsamplers. + + Args: + vqvae_config (`JukeboxVQVAEConfig`, *optional*): + Configuration for the `JukeboxVQVAE` model. + prior_config_list (`List[JukeboxPriorConfig]`, *optional*): + List of the configs for each of the `JukeboxPrior` of the model. The original architecture uses 3 priors. + nb_priors (`int`, *optional*, defaults to 3): + Number of prior models that will sequentially sample tokens. Each prior is conditional auto regressive + (decoder) model, apart from the top prior, which can include a lyric encoder. The available models were + trained using a top prior and 2 upsampler priors. + sampling_rate (`int`, *optional*, defaults to 44100): + Sampling rate of the raw audio. + timing_dims (`int`, *optional*, defaults to 64): + Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding + layer. The timing embedding layer converts the absolute and relative position in the currently sampled + audio to a tensor of length `timing_dims` that will be added to the music tokens. + min_duration (`int`, *optional*, defaults to 0): + Minimum duration of the audios to generate + max_duration (`float`, *optional*, defaults to 600.0): + Maximum duration of the audios to generate + max_nb_genres (`int`, *optional*, defaults to 5): + Maximum number of genres that can be used to condition a single sample. + metadata_conditioning (`bool`, *optional*, defaults to `True`): + Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum + duration. + + Example: + + ```python + >>> from transformers import JukeboxModel, JukeboxConfig + + >>> # Initializing a Jukebox configuration + >>> configuration = JukeboxConfig() + + >>> # Initializing a model from the configuration + >>> model = JukeboxModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "jukebox" + + def __init__( + self, + vqvae_config=None, + prior_config_list=None, + nb_priors=3, + sampling_rate=44100, + timing_dims=64, + min_duration=0, + max_duration=600.0, + max_nb_genres=5, + metadata_conditioning=True, + **kwargs, + ): + if vqvae_config is None: + vqvae_config = {} + logger.info("vqvae_config is None. initializing the JukeboxVQVAE with default values.") + + self.vqvae_config = JukeboxVQVAEConfig(**vqvae_config) + if prior_config_list is not None: + self.prior_configs = [JukeboxPriorConfig(**prior_config) for prior_config in prior_config_list] + else: + self.prior_configs = [] + for prior_idx in range(nb_priors): + prior_config = kwargs.pop(f"prior_{prior_idx}", None) + if prior_config is None: + prior_config = {} + logger.info( + f"prior_{prior_idx}'s config is None. Initializing the JukeboxPriorConfig list with default" + " values." + ) + self.prior_configs.append(JukeboxPriorConfig(**prior_config)) + + self.hop_fraction = self.vqvae_config.hop_fraction + + self.nb_priors = nb_priors + + # Metadata conditioning + self.max_nb_genres = max_nb_genres + self.sampling_rate = sampling_rate + self.timing_dims = timing_dims + self.min_duration = min_duration + self.max_duration = max_duration + self.metadata_conditioning = metadata_conditioning + + super().__init__(**kwargs) + + @classmethod + def from_configs(cls, prior_configs: list[JukeboxPriorConfig], vqvae_config: JukeboxVQVAEConfig, **kwargs): + r""" + Instantiate a [`JukeboxConfig`] (or a derived class) from clip text model configuration and clip vision model + configuration. + + Returns: + [`JukeboxConfig`]: An instance of a configuration object + """ + prior_config_list = [config.to_dict() for config in prior_configs] + return cls(prior_config_list=prior_config_list, vqvae_config_dict=vqvae_config.to_dict(), **kwargs) + + def to_dict(self): + # Override the default to_dict to apply to_dict to the list of prior configs. + result = super().to_dict() + result["prior_config_list"] = [config.to_dict() for config in result.pop("prior_configs")] + return result + + +__all__ = ["JukeboxConfig", "JukeboxPriorConfig", "JukeboxVQVAEConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/modeling_jukebox.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/modeling_jukebox.py new file mode 100644 index 0000000000000000000000000000000000000000..f928d49cf5f73156c1352e1a5cb8695a57a08c1c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/modeling_jukebox.py @@ -0,0 +1,2670 @@ +# coding=utf-8 +# Copyright 2022 The OpenAI Team Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Jukebox model.""" + +import math +import os +from typing import Optional + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import LayerNorm as FusedLayerNorm + +from ....activations import ACT2FN +from ....modeling_utils import PreTrainedModel +from ....utils import add_start_docstrings, logging +from ....utils.logging import tqdm +from .configuration_jukebox import ATTENTION_PATTERNS, JukeboxConfig, JukeboxPriorConfig, JukeboxVQVAEConfig + + +logger = logging.get_logger(__name__) + + +def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")): + """ + Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + + Args: + logits (`torch.Tensor`): + logits distribution shape (vocabulary size) + top_k (`int`, *optional*, defaults to 0): + When `top_k >0` keep only top key tokens with highest probability (top-k filtering). + top_p (`int`, *optional*, defaults to 0): + When `top_p>0.0` keep the top tokens with cumulative probability >= `top_p` (nucleus filtering). + """ + logits = logits.clone() + top_k = min(top_k, logits.size(-1)) # Safety check + + if top_k > 0: + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k, dim=-1)[0][..., -1:] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # indices_to_remove = sorted_indices[sorted_indices_to_remove] + indices_to_remove = torch.zeros_like(logits, dtype=torch.bool).scatter_( + dim=-1, index=sorted_indices, src=sorted_indices_to_remove + ) + logits[indices_to_remove] = filter_value + return logits + + +def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration): + """ + Extract only the relevant tokens based on the character position. A total of `max_n_lyric_tokens` tokens will be + returned. If the provided token sequence is smaller, it will be padded, otherwise, only characters ranging from the + midpoint - `max_n_lyric_tokens//2` to the midpoint + `max_n_lyric_tokens//2` will be returned. This *focuses* on + the most relevant tokens (in time) for the sequence. + + Args: + full_tokens (`list[int]`): + List containing the token ids of the entire lyrics. + total_length (`int`): + Total expected length of the music (not all of it is generated, see duration), in samples. + offset (`int`): + Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted take that into + account + duration (`int`): + Expected duration of the generated music, in samples. The duration has to be smaller than the total length, + which represent the overall length of the signal, + """ + full_tokens = full_tokens[0] + if len(full_tokens) < max_n_lyric_tokens: + tokens = torch.cat( + [torch.zeros(max_n_lyric_tokens - len(full_tokens), dtype=torch.long).to(full_tokens.device), full_tokens] + ) + indices = [-1] * (max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens))) + else: + midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length) + midpoint = min(max(midpoint, max_n_lyric_tokens // 2), len(full_tokens) - max_n_lyric_tokens // 2) + tokens = full_tokens[midpoint - max_n_lyric_tokens // 2 : midpoint + max_n_lyric_tokens // 2] + indices = list(range(midpoint - max_n_lyric_tokens // 2, midpoint + max_n_lyric_tokens // 2)) + return tokens.unsqueeze(dim=0), indices + + +# Break total_length into hops/windows of size n_ctx separated by hop_length +def get_starts(total_length, n_ctx, hop_length): + starts = [] + for start in range(0, total_length - n_ctx + hop_length, hop_length): + if start + n_ctx >= total_length: + # Last hop could be smaller, we make it n_ctx to maximise context + start = total_length - n_ctx + starts.append(start) + return starts + + +def get_alignment(music_tokens, labels, prior, config): + level = prior.levels - 1 # Top level used + n_ctx = prior.n_ctx + tokens = music_tokens[level] + batch_size, total_length = tokens.shape[0], tokens.shape[1] + if total_length < n_ctx: + padding_length = n_ctx - total_length + tokens = torch.cat( + [tokens, torch.zeros(batch_size, n_ctx - total_length, dtype=tokens.dtype, device=tokens.device)], dim=1 + ) + total_length = tokens.shape[1] + else: + padding_length = 0 + + hop_length = int(config.hop_fraction[-level - 1] * prior.n_ctx) + alignment_head, alignment_layer = config.prior_alignment_head[0], config.prior_alignment_layer[0] + attn_layers = {alignment_layer} + alignment_hops = {} + indices_hops = {} + for start in tqdm(get_starts(total_length, n_ctx, hop_length), desc="Computing lyric to music alignment "): + end = start + n_ctx + # set metadata offset, sample_length and lyrics tokens + metadata, indices_hop = prior.get_metadata(labels, start, config.sample_length, get_indices=True, offset=0) + tokens_bs = torch.chunk(tokens, batch_size, dim=0) + metadata_bs = torch.chunk(metadata, batch_size, dim=0) + w_hops = [] + for tokens_i, metadata_i in zip(tokens_bs, metadata_bs): + w_hop = prior.forward_tokens(tokens_i[:, start:end], [], metadata_i, get_attn_weights=attn_layers) + w_hops.append(w_hop[0][:, alignment_head]) + del w_hop + weights = torch.cat(w_hops, dim=0) + del w_hops + alignment_hop = weights.to(device="cpu", dtype=torch.float).numpy() + del weights + + # alignment_hop has shape (bs, n_ctx, nb_relevant_lyric_tokens) + # indices_hop is a list of len=bs, each entry of len hps.nb_relevant_lyric_tokens + indices_hops[start] = indices_hop + alignment_hops[start] = alignment_hop + + # Combine attn for each hop into attn for full range + # Use indices to place them into correct place for corresponding source tokens + alignments = [] + for item in range(batch_size): + # Note each item has different length lyrics + full_tokens = labels[0, 3:] + alignment = np.zeros((total_length, len(full_tokens) + 1)) + for start in reversed(get_starts(total_length, n_ctx, hop_length)): + end = start + n_ctx + alignment_hop = alignment_hops[start][item] + indices = indices_hops[start][item] + alignment[start:end, indices] = alignment_hop + alignment = alignment[: total_length - padding_length, :-1] # remove token padding, and last lyric index + alignments.append(alignment) + return alignments + + +def save_temp_audio(fname, lvl, metas, aud): + aud = torch.clamp(aud, -1, 1).cpu().numpy() + for i in list(range(aud.shape[0])): + if metas is not None: + artists, genres, lyrics = list(metas)[i].values() + path = f"{fname}/lvl_{lvl}-{artists}-{genres}-{lyrics[:5]}-{i}" + np.save(path, aud[i]) + else: + np.save(f"{fname}/lvl_{lvl}-sample-{i}", aud[i]) + + +def get_mask(mask, query_length, key_value_length, blocks, spread, device, sample, sample_t): + # returns a mask of shape 1 x 1 x query_length x key_value_length or None if masking is not needed. + if mask is None or query_length == 1: + return None + offset = sample_t - query_length if sample else max(key_value_length - query_length, 0) + if mask == "autoregressive": + # Masked dense + mask = torch.ones(query_length, key_value_length, device=device).tril(offset) + elif mask == "summary": + # Masked summary + mask = torch.ones(query_length, query_length, device=device).tril() + mask = torch.ones(query_length, query_length, device=device).tril() + mask = mask.view(query_length, blocks, query_length // blocks)[:, :-1, -key_value_length // blocks :] + mask = ( + torch.nn.functional.pad( + mask, + (0, 0, 1, 0), + value=1, + ) + .contiguous() + .view(query_length, key_value_length) + ) + elif mask == "prime": + mask = torch.ones(query_length, key_value_length, device=device).tril(offset) + return mask.view(1, 1, query_length, key_value_length) + + +class JukeboxConv1D(nn.Module): + def __init__(self, input_width, output_width): + super().__init__() + self.input_width = input_width + self.output_width = output_width + weight = torch.empty(input_width, output_width) + bias = torch.zeros(output_width) + self.weight = nn.Parameter(weight) + self.bias = nn.Parameter(bias) + + def forward(self, hidden_states): + size_out = (*hidden_states.size()[:-1], self.output_width) + hidden_states = torch.addmm( + self.bias.type_as(hidden_states), + hidden_states.view(-1, hidden_states.size(-1)), + self.weight.type_as(hidden_states), + ) + hidden_states = hidden_states.view(*size_out) + return hidden_states + + +class JukeboxResConv1DBlock(nn.Module): + def __init__(self, config, conv_width, depth=1, res_scale=1.0): + super().__init__() + hidden_dim = config.res_convolution_multiplier * conv_width + dilation = config.res_dilation_growth_rate**depth + padding = dilation + + self.res_scale = res_scale + self.activation = nn.ReLU() + self.conv1d_1 = nn.Conv1d(conv_width, hidden_dim, 3, 1, padding, dilation) + self.conv1d_2 = nn.Conv1d(hidden_dim, conv_width, 1, 1, 0) + + def forward(self, hidden_states): + residuals = hidden_states + hidden_states = self.activation(hidden_states) + hidden_states = self.conv1d_1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.conv1d_2(hidden_states) + return residuals + self.res_scale * hidden_states + + +class JukeboxResnet1D(nn.Module): + def __init__(self, config, conv_width, n_depth, reverse_dilation=False): + super().__init__() + self.dilation_cycle = config.res_dilation_cycle + res_scale = 1.0 if not config.conv_res_scale else 1.0 / math.sqrt(n_depth) + + blocks = [] + for depth in range(n_depth): + block_depth = depth if self.dilation_cycle is None else depth % self.dilation_cycle + blocks.append(JukeboxResConv1DBlock(config, conv_width, block_depth, res_scale)) + + if reverse_dilation: + blocks = blocks[::-1] + self.resnet_block = nn.ModuleList(blocks) + + def forward(self, hidden_states): + for block in self.resnet_block: + hidden_states = block(hidden_states) + return hidden_states + + +class JukeboxEncoderConvBlock(nn.Module): + def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t): + super().__init__() + blocks = [] + filter_t = stride_t * 2 + pad_t = stride_t // 2 + if down_t > 0: + for i in range(down_t): + blocks.append(nn.Conv1d(embed_dim if i == 0 else hidden_dim, hidden_dim, filter_t, stride_t, pad_t)) + blocks.append(JukeboxResnet1D(config, hidden_dim, depth)) + self.proj_out = nn.Conv1d(hidden_dim, config.embed_dim, 3, 1, 1) + self.downsample_block = nn.ModuleList(blocks) + + def forward(self, hidden_states): + for block in self.downsample_block: + hidden_states = block(hidden_states) + hidden_states = self.proj_out(hidden_states) + return hidden_states + + +class JukeboxEncoder(nn.Module): + def __init__(self, config, width, depth, levels, downs_t, strides_t): + super().__init__() + self.levels = levels + self.level_blocks = nn.ModuleList() + + iterator = zip(list(range(self.levels)), downs_t, strides_t) + for i, down_t, stride_t in iterator: + self.level_blocks.append( + JukeboxEncoderConvBlock( + config, config.conv_input_shape if i == 0 else config.embed_dim, width, depth, down_t, stride_t + ) + ) + + def forward(self, hidden_states): + all_hidden_states = [] + + # 64, 32, ... + for level in range(self.levels): + level_block = self.level_blocks[level] + hidden_states = level_block(hidden_states) + all_hidden_states.append(hidden_states) + + return all_hidden_states + + +class JukeboxDecoderConvBock(nn.Module): + def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t, reverse_dilation=True): + self.embed_dim = embed_dim + self.hidden_dim = hidden_dim + super().__init__() + blocks = [] + if down_t > 0: + filter_t = stride_t * 2 + pad_t = stride_t // 2 + self.proj_in = nn.Conv1d(embed_dim, hidden_dim, 3, 1, 1) + for i in range(down_t): + blocks.append(JukeboxResnet1D(config, hidden_dim, depth, reverse_dilation)) + blocks.append( + nn.ConvTranspose1d( + hidden_dim, hidden_dim if i < down_t - 1 else embed_dim, filter_t, stride_t, pad_t + ) + ) + self.upsample_block = nn.ModuleList(blocks) + + def forward(self, hidden_states): + hidden_states = self.proj_in(hidden_states) + for block in self.upsample_block: + hidden_states = block(hidden_states) + return hidden_states + + +class JukeboxDecoder(nn.Module): + def __init__(self, config, hidden_dim, depth, levels, downs_t, strides_t): + super().__init__() + self.levels = levels + self.level_blocks = nn.ModuleList() + for level, down_t, stride_t in zip(list(range(self.levels)), downs_t, strides_t): + self.level_blocks.append( + JukeboxDecoderConvBock(config, config.embed_dim, hidden_dim, depth, down_t, stride_t) + ) + + self.out = nn.Conv1d(config.embed_dim, config.conv_input_shape, 3, 1, 1) + + def forward(self, hidden_states, all_levels=True): + hidden_state = hidden_states[-1] + + # 32, 64 ... + for level in reversed(range(self.levels)): + level_block = self.level_blocks[level] + hidden_state = level_block(hidden_state) + + if level != 0 and all_levels: + hidden_state = hidden_state + hidden_states[level - 1] + + hidden_state = self.out(hidden_state) + return hidden_state + + +class JukeboxBottleneckBlock(nn.Module): + def __init__(self, config: JukeboxVQVAEConfig): + super().__init__() + self.nb_discrete_codes = config.nb_discrete_codes + self.codebook_width = config.embed_dim + self.mu = config.lmu + self.threshold = 1.0 + self.init = False + self.codebook_sum = None + self.codebook_elem = None + self.register_buffer("codebook", torch.zeros(self.nb_discrete_codes, self.codebook_width)) + + def _tile(self, hidden_states): + dim, embed_width = hidden_states.shape + if dim < self.nb_discrete_codes: + n_repeats = (self.nb_discrete_codes + dim - 1) // dim + std = 0.01 / np.sqrt(embed_width) + hidden_states = hidden_states.repeat(n_repeats, 1) + hidden_states = hidden_states + torch.randn_like(hidden_states) * std + return hidden_states + + def init_codebook(self, hidden_states): + nb_discrete_codes = self.nb_discrete_codes + self.init = True + codes = self._tile(hidden_states) + self.codebook = codes[torch.randperm(codes.shape[0])][:nb_discrete_codes] + self.codebook_sum = self.codebook + self.codebook_elem = torch.ones(nb_discrete_codes, device=self.codebook.device) + + def update_codebook(self, hidden_states, latent_states): + mu, codebook_width, nb_discrete_codes = self.mu, self.codebook_width, self.nb_discrete_codes + with torch.no_grad(): + # Calculate new centres + # nb_discrete_codes, batch_size * seq_length + latent_states_onehot = torch.zeros(nb_discrete_codes, hidden_states.shape[0], device=hidden_states.device) + latent_states_onehot.scatter_(0, latent_states.view(1, hidden_states.shape[0]), 1) + + _codebook_sum = torch.matmul(latent_states_onehot, hidden_states) + _codebook_elem = latent_states_onehot.sum(dim=-1) # nb_discrete_codes + codes = self._tile(hidden_states) + _random_codebook = codes[torch.randperm(codes.shape[0])][:nb_discrete_codes] + + # Update centres + old_codebook = self.codebook + self.codebook_sum = mu * self.codebook_sum + (1.0 - mu) * _codebook_sum + self.codebook_elem = mu * self.codebook_elem + (1.0 - mu) * _codebook_elem # nb_discrete_codes + usage = (self.codebook_elem.view(nb_discrete_codes, 1) >= self.threshold).float() + + norm_code = self.codebook_sum.view(nb_discrete_codes, codebook_width) / self.codebook_elem.view( + nb_discrete_codes, 1 + ) + self.codebook = usage * (norm_code) + (1 - usage) * _random_codebook + _codebook_prob = _codebook_elem / torch.sum(_codebook_elem) # prob of each bin + entropy = -torch.sum(_codebook_prob * torch.log(_codebook_prob + 1e-8)) # entropy ie how diverse + used_curr = (_codebook_elem >= self.threshold).sum() + usage = torch.sum(usage) + dk = torch.linalg.norm(self.codebook - old_codebook) / np.sqrt(np.prod(old_codebook.shape)) + return {"entropy": entropy, "used_curr": used_curr, "usage": usage, "dk": dk} + + def preprocess(self, hidden_states): + hidden_states = hidden_states.permute(0, 2, 1).contiguous() + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + + if hidden_states.shape[-1] == self.codebook_width: + prenorm = torch.linalg.norm(hidden_states - torch.mean(hidden_states)) / np.sqrt( + np.prod(hidden_states.shape) + ) + elif hidden_states.shape[-1] == 2 * self.codebook_width: + x1, x2 = hidden_states[..., : self.codebook_width], hidden_states[..., self.codebook_width :] + prenorm = (torch.linalg.norm(x1 - torch.mean(x1)) / np.sqrt(np.prod(x1.shape))) + ( + torch.linalg.norm(x2 - torch.mean(x2)) / np.sqrt(np.prod(x2.shape)) + ) + + # Normalise + hidden_states = x1 + x2 + + return hidden_states, prenorm + + def postprocess(self, latent_states, dequantised_states, x_shape): + batch_size, time = x_shape + dequantised_states = dequantised_states.view(batch_size, time, -1).permute(0, 2, 1).contiguous() + latent_states = latent_states.view(batch_size, time) + return latent_states, dequantised_states + + def quantise(self, latent_states): + # Calculate latent code latent_states + codebook_weights = self.codebook.t() + distance = ( + torch.sum(latent_states**2, dim=-1, keepdim=True) + - 2 * torch.matmul(latent_states, codebook_weights) + + torch.sum(codebook_weights**2, dim=0, keepdim=True) + ) # (batch_size * latent_states , codebook_weights) + min_distance, music_tokens = torch.min(distance, dim=-1) + fit = torch.mean(min_distance) + return music_tokens, fit + + def dequantise(self, music_tokens): + dequantised_states = F.embedding(music_tokens, self.codebook) + return dequantised_states + + def encode(self, latent_states): + samples, _, seq_len = latent_states.shape + + # Preprocess. + latent_states, _ = self.preprocess(latent_states) + + # Quantise + music_tokens, _ = self.quantise(latent_states) + + # Postprocess. + music_tokens = music_tokens.view(samples, seq_len) + return music_tokens + + def decode(self, music_tokens): + samples, seq_len = music_tokens.shape + + # Dequantise + dequantised_states = self.dequantise(music_tokens) + + # Postprocess + dequantised_states = ( + dequantised_states.view(samples, seq_len, self.codebook_width).permute(0, 2, 1).contiguous() + ) + return dequantised_states + + def forward(self, hidden_states, update_codebook=True): + samples, _, seq_len = hidden_states.shape + + # Preprocess + hidden_states, prenorm = self.preprocess(hidden_states) + + # Init codebook if not inited + if update_codebook and not self.init: + self.init_codebook(hidden_states) + + # Quantise and dequantise through bottleneck + music_tokens, fit = self.quantise(hidden_states) + dequantised_states = self.dequantise(music_tokens) + + # Update embeddings + if update_codebook: + update_metrics = self.update_codebook(hidden_states, music_tokens) + else: + update_metrics = {} + + # Loss + commit_loss = torch.linalg.norm(dequantised_states.detach() - hidden_states) ** 2 / np.prod( + hidden_states.shape + ) + + # Passthrough + dequantised_states = hidden_states + (dequantised_states - hidden_states).detach() + + # Postprocess + music_tokens, dequantised_states = self.postprocess(music_tokens, dequantised_states, (samples, seq_len)) + return music_tokens, dequantised_states, commit_loss, dict(fit=fit, pn=prenorm, **update_metrics) + + +class JukeboxBottleneck(nn.Module): + def __init__(self, config, levels): + super().__init__() + self.levels = levels + self.level_blocks = nn.ModuleList() + for level in range(self.levels): + self.level_blocks.append(JukeboxBottleneckBlock(config)) + + def encode(self, raw_audio): + music_tokens = [ + level_block.encode(hidden_states) for (level_block, hidden_states) in zip(self.level_blocks, raw_audio) + ] + return music_tokens + + def decode(self, music_tokens, start_level=0, end_level=None): + if end_level is None: + end_level = self.levels + quantised_audio = [ + level_block.decode(z) for (level_block, z) in zip(self.level_blocks[start_level:end_level], music_tokens) + ] + return quantised_audio + + def forward(self, input_audio): + music_tokens, quantised_states, commit_losses, metrics = [], [], [], [] + for level in range(self.levels): + level_block = self.level_blocks[-level - 1] + hidden_states = input_audio[level] + sampled_tokens, quantised_state, commit_loss, metric = level_block( + hidden_states, update_codebook=self.training + ) + music_tokens.append(sampled_tokens) + if not self.training: + # Be extra paranoid and make sure the encoder weights can't + # change from straight-through estimator + quantised_state = quantised_state.detach() + quantised_states.append(quantised_state) + commit_losses.append(commit_loss) + if self.training: + metrics.append(metric) + return music_tokens, quantised_states, commit_losses, metrics + + +JUKEBOX_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config (`JukeboxConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + """The Hierarchical VQ-VAE model used in Jukebox. This model follows the Hierarchical VQVAE paper from [Will Williams, Sam +Ringer, Tom Ash, John Hughes, David MacLeod, Jamie Dougherty](https://huggingface.co/papers/2002.08111). + + """, + JUKEBOX_START_DOCSTRING, +) +class JukeboxVQVAE(PreTrainedModel): + config: JukeboxVQVAEConfig + base_model_prefix = "vqvae" + + def _init_weights(self, module): + if isinstance(module, nn.Embedding): # embed_tokens + module.weight.data.normal_(mean=0.0, std=0.02 * self.config.init_scale) + elif isinstance(module, JukeboxConv1D): + if self.config.zero_out: + module.weight.data.zero_() + else: + module.weight.data.normal_(mean=0.0, std=0.02 * self.config.init_scale) + elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out: + module.conv1d_2.weight.data.zero_() + module.conv1d_2.bias.data.zero_() + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def __init__(self, config: JukeboxVQVAEConfig): + super().__init__(config) + downs_t = config.res_downs_t + strides_t = config.res_strides_t + if not config.sample_length: + downsamples = [stride**down for stride, down in zip(strides_t, downs_t)] + top_raw_to_tokens = np.prod(downsamples) + config.sample_length = ( + config.sample_length_in_seconds * config.sampling_rate // top_raw_to_tokens + ) * top_raw_to_tokens + config.sample_length = config.sample_length.astype(int) + + self.nb_discrete_codes = config.nb_discrete_codes + self.commit = config.commit + self.sample_length = config.sample_length + + self.downsamples = [stride**down for stride, down in zip(strides_t, downs_t)] + self.hop_lengths = np.cumprod(self.downsamples) + self.levels = levels = config.levels + self.music_tokens_shapes = [ + (int(self.sample_length // self.hop_lengths[-level - 1])) for level in range(levels) + ] + + self.multipliers = config.multipliers if config.multipliers is not None else [1] * levels + + self.encoders = nn.ModuleList() + self.decoders = nn.ModuleList() + for level in range(levels): + width = config.res_conv_width * self.multipliers[level] + depth = config.res_conv_depth * self.multipliers[level] + self.encoders.append( + JukeboxEncoder(config, width, depth, level + 1, downs_t[: level + 1], strides_t[: level + 1]) + ) + self.decoders.append( + JukeboxDecoder(config, width, depth, level + 1, downs_t[: level + 1], strides_t[: level + 1]) + ) + + self.bottleneck = JukeboxBottleneck(config, levels) + + def _decode(self, music_tokens, start_level=0, end_level=None): + # Decode + if end_level is None: + end_level = self.levels + latent_states = self.bottleneck.decode(music_tokens, start_level=start_level, end_level=end_level) + # Use only lowest level + decoder, dequantised_state = self.decoders[start_level], latent_states[0:1] + dequantised_state = decoder(dequantised_state, all_levels=False) + dequantised_state = dequantised_state.permute(0, 2, 1) + return dequantised_state + + def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1) -> torch.Tensor: + """ + Transforms the input `music_tokens` to their `raw_audio` representation. + + Args: + music_tokens (`torch.LongTensor`): + Tensor of music tokens which will be decoded to raw audio by using the codebook. Each music token + should be an index to a corresponding `code` vector in the codebook. + start_level (`int`, *optional*): + Level at which the decoding process will start. Default to 0. + end_level (`int`, *optional*): + Level at which the decoding process will start. Default to None. + bs_chunks (int, *optional*): + Number of chunks to process at the same time. + """ + token_chunks = [torch.chunk(token, bs_chunks, dim=0) for token in music_tokens] + dequantised_states = [] + for i in range(bs_chunks): + music_tokens_i = [chunks[i] for chunks in token_chunks] + dequantised_state = self._decode(music_tokens_i, start_level=start_level, end_level=end_level) + dequantised_states.append(dequantised_state) + return torch.cat(dequantised_states, dim=0) + + def _encode(self, raw_audio, start_level=0, end_level=None): + # Encode + if end_level is None: + end_level = self.levels + input_audio = raw_audio.permute(0, 2, 1).float() + latent_states = [] + for level in range(self.levels): + encoder = self.encoders[level] + latent_state = encoder(input_audio) + latent_states.append(latent_state[-1]) + music_tokens = self.bottleneck.encode(latent_states) + return music_tokens[start_level:end_level] + + def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1): + """ + Transforms the `input_audio` to a discrete representation made out of `music_tokens`. + + Args: + input_audio (`torch.Tensor`): + Raw audio which will be encoded to its discrete representation using the codebook. The closest `code` + form the codebook will be computed for each sequence of samples. + start_level (`int`, *optional*, defaults to 0): + Level at which the encoding process will start. Default to 0. + end_level (`int`, *optional*): + Level at which the encoding process will start. Default to None. + bs_chunks (int, *optional*, defaults to 1): + Number of chunks of raw audio to process at the same time. + """ + audio_chunks = torch.chunk(input_audio, bs_chunks, dim=0) + music_tokens_list = [] + for chunk_i in audio_chunks: + music_tokens_i = self._encode(chunk_i, start_level=start_level, end_level=end_level) + music_tokens_list.append(music_tokens_i) + music_tokens = [torch.cat(music_tokens_level, dim=0) for music_tokens_level in zip(*music_tokens_list)] + return music_tokens + + def sample(self, n_samples): + music_tokens = [ + torch.randint(0, self.nb_discrete_codes, size=(n_samples, *music_tokens_shape), device="cpu") + for music_tokens_shape in self.music_tokens_shapes + ] + return self.decode(music_tokens) + + def forward(self, raw_audio: torch.FloatTensor) -> tuple[torch.Tensor, torch.Tensor]: + """ + Forward pass of the VQ-VAE, encodes the `raw_audio` to latent states, which are then decoded for each level. + The commit loss, which ensure that the encoder's computed embeddings are close to the codebook vectors, is + computed. + + Args: + raw_audio (`torch.FloatTensor`): + Audio input which will be encoded and decoded. + + Returns: + `tuple[torch.Tensor, torch.Tensor]` + + + Example: + ```python + >>> from transformers import JukeboxVQVAE, set_seed + >>> import torch + + >>> model = JukeboxVQVAE.from_pretrained("openai/jukebox-1b-lyrics").eval() + >>> set_seed(0) + >>> zs = [torch.randint(100, (4, 1))] + >>> model.decode(zs).shape + torch.Size([4, 8, 1]) + ``` + """ + + # Encode/Decode + input_audio = raw_audio.permute(0, 2, 1).float() + latent_states = [] + for level in range(self.levels): + encoder = self.encoders[level] + latent_state = encoder(input_audio) + latent_states.append(latent_state[-1]) + + _, music_tokens, commit_losses, _ = self.bottleneck(latent_states) + dequantised_states = [] + for level in range(self.levels): + decoder = self.decoders[level] + dequantised_state = decoder(music_tokens[level : level + 1], all_levels=False) + dequantised_states.append(dequantised_state.permute(0, 2, 1)) + + commit_loss = sum(commit_losses) + loss = self.commit * commit_loss + + return dequantised_states, loss + + +class JukeboxMLP(nn.Module): + def __init__(self, config): + # a single channel is always used in original code + super().__init__() + embed_dim = config.hidden_size + hidden_dim = int(config.mlp_multiplier * embed_dim) + + self.c_fc = JukeboxConv1D(embed_dim, hidden_dim) + self.c_proj = JukeboxConv1D(hidden_dim, embed_dim) + self.act = ACT2FN[config.act_fn] + self.dropout = nn.Dropout(config.resid_dropout) + + def forward(self, hidden_states): + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class JukeboxLayerNorm(FusedLayerNorm): + def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True): + super().__init__(normalized_shape, eps=eps, elementwise_affine=elementwise_affine) + self.width = np.prod(normalized_shape) + self.max_numel = 65535 * self.width + + def forward(self, input): + if input.numel() > self.max_numel: + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps).type_as(input) + else: + return super().forward(input).type_as(input) + + +class JukeboxAttention(nn.Module): + def __init__(self, config, n_ctx, attn_func="dense_attn"): + super().__init__() + self.embed_dim = config.hidden_size + self.n_heads = config.n_heads + self.dropout = config.attn_dropout + hidden_dim = int(config.attention_multiplier * self.embed_dim) + + self.head_dim = hidden_dim // config.n_heads + self.n_ctx = n_ctx + self.hidden_dim = hidden_dim + self.scale = self.head_dim**-0.25 + self.mask = config.mask + + if attn_func == "cross_attention": + self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim) + self.c_enc_kv = JukeboxConv1D(self.embed_dim, hidden_dim * 2) + else: + self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim * 3) + + self.c_proj = JukeboxConv1D(hidden_dim, self.embed_dim) + self.attn_dropout = nn.Dropout(config.attn_dropout) + self.resid_dropout = nn.Dropout(config.resid_dropout) + + # Sequence of length seq_len is factored as [blocks, seq_len // blocks] + self.attn_func = attn_func + if attn_func == "cross_attention": + self.qkv = self.decode_qkv + elif attn_func == "prime_attn": + self.qkv = self.prime_qkv + else: + self.qkv = self.factored_qkv + + ATTENTION_MAP = { + "dense_attn": (self.dense_attn, "autoregressive"), + "block_attn": (self.block_attn, "autoregressive"), + "transpose_block_attn": (self.transpose_block_attn, "autoregressive"), + "prev_block_attn": (self.prev_block_attn, None), + "summary_attn": (self.summary_attn, "summary"), + "summary_spread_attn": (self.summary_spread_attn, "summary"), + "cross_attention": (self.dense_attn, None), + "prime_attn": (self.prime_attn, "prime"), + } + self.attn, self.attn_mask = ATTENTION_MAP[attn_func] + + self.blocks = config.blocks + self.spread = config.spread + if self.blocks is not None: + self.block_ctx = self.n_ctx // self.blocks + + self.sample_t = 0 + self.cache = {} + self.encoder_len = config.nb_relevant_lyric_tokens # length of the encoder input ids + self.record_attn = False + + def _attn(self, query_states, key_states, value_states, sample): + scale = self.scale + if self.training: + attention_weight = torch.matmul(query_states * scale, key_states * scale) + else: + attention_weight = torch.matmul(query_states, key_states) + attention_weight.mul_(scale * scale) + attn_weight_type = attention_weight.dtype + attention_weight = attention_weight.float() + if self.mask: + # Generate appropriate mask to mask out all positions before current + # Might take up lot of memory for dense, so can cache it + mask = get_mask( + self.attn_mask, + query_states.size(-2), + key_states.size(-1), + self.blocks, + self.spread, + attention_weight.device, + sample, + self.sample_t, + ) + if mask is not None: + attention_weight = attention_weight * mask + -1e9 * (1 - mask) + attention_prob = F.softmax(attention_weight, dim=-1).type(attn_weight_type) + if self.record_attn: + self.attention_prob = attention_prob + if self.attn_func == "prime_attn": + # only keep music queries and lyrics keys/values + self.attention_prob = self.attention_prob[:, :, self.encoder_len :, : self.encoder_len] + attention_prob = self.attn_dropout(attention_prob) + context_states = torch.matmul(attention_prob, value_states) + return context_states + + def merge_heads(self, hidden_states): + hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous() + new_hidden_states_shape = (*hidden_states.size()[:-2], hidden_states.size(-2) * hidden_states.size(-1)) + return hidden_states.view(*new_hidden_states_shape) # in Tensorflow implem: fct merge_states + + def split_heads(self, hidden_states, is_key=False): + new_hidden_states_shape = ( + *hidden_states.size()[:-1], + self.n_heads, + hidden_states.size(-1) // self.n_heads, + ) + hidden_states = hidden_states.view(*new_hidden_states_shape) # in Tensorflow implem: fct split_states + if is_key: + return hidden_states.permute(0, 2, 3, 1) + else: + return hidden_states.permute(0, 2, 1, 3) + + def dense_attn(self, query, key, value, sample): + query = self.split_heads(query) + key = self.split_heads(key, is_key=True) + value = self.split_heads(value) + context_states = self._attn(query, key, value, sample) + context_states = self.merge_heads(context_states) + return context_states + + def block_attn(self, query, key, value, sample): + block_ctx = self.block_ctx + batch_size, seq_len, embed_dim = value.shape # For sample, query_len= 1, key_len = value_len = sample_t + if sample: + return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim) + else: + query_length = query.shape[1] + query = query.view(batch_size * query_length // block_ctx, block_ctx, embed_dim) + if query_length < seq_len: + seq_len = query_length + key = key[:, -seq_len:].contiguous() + value = value[:, -seq_len:].contiguous() + key = key.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim) + value = value.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim) + return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim) + + def transpose_block_attn(self, query, key, value, sample): + block_ctx = self.block_ctx + batch_size, seq_len, embed_dim = value.shape # For sample, query_len= 1, key_len = value_len = sample_t + if sample: + block_len = (seq_len - 1) % block_ctx + key = key[:, block_len::block_ctx, :] + value = value[:, block_len::block_ctx, :] + return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim) + else: + query_length = query.shape[1] + query = query.view(batch_size, query_length // block_ctx, block_ctx, embed_dim) + query = query.transpose(1, 2).contiguous() + query = query.view(batch_size * block_ctx, query_length // block_ctx, embed_dim) + + key = key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim) + key = key.transpose(1, 2).contiguous() + key = key.view(batch_size * block_ctx, seq_len // block_ctx, embed_dim) + + value = value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim) + value = value.transpose(1, 2).contiguous() + value = value.view(batch_size * block_ctx, seq_len // block_ctx, embed_dim) + + block_attn = self.dense_attn(query, key, value, sample) + block_attn = block_attn.view(batch_size, block_ctx, query_length // block_ctx, embed_dim) + block_attn = block_attn.transpose(1, 2).contiguous() + block_attn = block_attn.view(batch_size, query_length, embed_dim) + + return block_attn + + def prev_block_attn(self, query, key, value, sample): + block_ctx = self.block_ctx + batch_size, seq_len, embed_dim = value.shape # For sample, query_len= 1, key_len = value_len = sample_t + if sample: + block = (seq_len - 1) // block_ctx + prev_l = (block - 1) * block_ctx + if block > 0: + key = key[:, prev_l : prev_l + block_ctx, :] + value = value[:, prev_l : prev_l + block_ctx, :] + else: + key = torch.zeros(batch_size, block_ctx, embed_dim, device=query.device, dtype=query.dtype) + value = torch.zeros(batch_size, block_ctx, embed_dim, device=query.device, dtype=query.dtype) + return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim) + else: + query_length = query.shape[1] + query = query.view(batch_size * query_length // block_ctx, block_ctx, embed_dim) + + key = key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :] + key = torch.nn.functional.pad(key, (0, 0, 0, 0, 1, 0)) + key = key.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim) + + value = value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :] + value = torch.nn.functional.pad(value, (0, 0, 0, 0, 1, 0)) + value = value.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim) + + if query_length < seq_len: + nb_query_blocks = query_length // block_ctx + nb_key_blocks = seq_len // block_ctx + seq_len = query_length + key = key.view(batch_size, nb_key_blocks, block_ctx, embed_dim)[:, -nb_query_blocks:] + key = key.contiguous().view(batch_size * nb_query_blocks, block_ctx, embed_dim) + + value = value.view(batch_size, nb_key_blocks, block_ctx, embed_dim)[:, -nb_query_blocks:] + value = value.contiguous().view(batch_size * nb_query_blocks, block_ctx, embed_dim) + + return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim) + + def summary_attn(self, query, key, value, sample): + blocks = self.blocks + block_ctx = self.block_ctx + batch_size, seq_len, embed_dim = value.shape # For sample, query_len= 1, key_len = value_len = sample_t + if sample: + key = key[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :] + key = torch.nn.functional.pad(key, (0, 0, 1, 0)) + + value = value[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :] + value = torch.nn.functional.pad(value, (0, 0, 1, 0)) + return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim) + else: + key = key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -1, :] + key = torch.nn.functional.pad(key, (0, 0, 1, 0)) # batch_size, blocks, embed_dim + + value = value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -1, :] + value = torch.nn.functional.pad(value, (0, 0, 1, 0)) # batch_size, blocks, embed_dim + return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim) + + def summary_spread_attn(self, query, key, value, sample): + blocks = self.blocks + spread = self.spread + + batch_size, seq_len, embed_dim = value.shape # For sample, query_len= 1, key_len = value_len = sample_t + if sample: + raise NotImplementedError + else: + key = key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :] + key = torch.nn.functional.pad(key, (0, 0, 0, 0, 1, 0)).contiguous() + key = key.view(batch_size, blocks * spread, embed_dim) + + value = value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :] + value = torch.nn.functional.pad(value, (0, 0, 0, 0, 1, 0)).contiguous() + value = value.view(batch_size, blocks * spread, embed_dim) + + return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim) + + def prime_attn(self, query, key, value, sample): + encoder_len = self._encoder_len + key = key[:, :encoder_len] + value = value[:, :encoder_len] + return self.dense_attn(query, key, value, sample) + + def factored_qkv(self, hidden_states, last_encoder_hidden_states=None, sample=False): + curr_ctx = hidden_states.shape[1] + if last_encoder_hidden_states is not None: + raise TypeError("last_encoder_hidden_states should be None") + + query, key, value = hidden_states.chunk(3, dim=2) + if sample: + self.sample_t += curr_ctx + key, value = self._append_cache(key, value) + l_cache = self._suff_cache_len() + if self._cache_len() > l_cache: + self._slice_cache(-l_cache) + if curr_ctx > 1: + if self.attn_func != "dense_attn": + query = self._pad_to_block_ctx(query, query=True) + key = self._pad_to_block_ctx(key) + value = self._pad_to_block_ctx(value) + sample = False + else: + key = self.cache["key"] + value = self.cache["value"] + return query, key, value, sample + + def prime_qkv(self, hidden_states, last_encoder_hidden_states=None, sample=False): + curr_ctx = hidden_states.shape[1] + if last_encoder_hidden_states is not None: + raise TypeError("last_encoder_hidden_states should be None") + query, key, value = hidden_states.chunk(3, dim=2) + if sample: + if self._cache_len() < self._encoder_len: + self._append_cache(key, value) + if self._cache_len() > self._encoder_len: + self._slice_cache(0, self._encoder_len) + key, value = self.cache["key"], self.cache["value"] + self.sample_t += curr_ctx + return query, key, value, sample + + def decode_qkv(self, hidden_states, last_encoder_hidden_states=None, sample=False): + curr_ctx = hidden_states.shape[1] + query = hidden_states + if sample: + if self.sample_t == 0: + self.cache["key"], self.cache["value"] = self.c_enc_kv( + last_encoder_hidden_states.type_as(hidden_states) + ).chunk(2, dim=2) + key, value = self.cache["key"], self.cache["value"] + self.sample_t += curr_ctx + else: + key, value = self.c_enc_kv(last_encoder_hidden_states.type_as(hidden_states)).chunk(2, dim=2) + return query, key, value, sample + + def forward(self, hidden_states, last_encoder_hidden_states=None, sample=False): + curr_ctx = hidden_states.shape[1] + hidden_states = self.c_attn(hidden_states) + query, key, value, sample = self.qkv( + hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=sample + ) + attention_scores = self.attn(query, key, value, sample) + if attention_scores.shape[1] != curr_ctx: + offset = self._offset(curr_ctx) + attention_scores = attention_scores[:, offset : offset + curr_ctx, :].contiguous() + attention_scores = self.c_proj(attention_scores) + return self.resid_dropout(attention_scores) + + @property + def _encoder_len(self): + encoder_len = self.encoder_len + encoder_blocks = (encoder_len // self.blocks) + 1 + return encoder_blocks * self.blocks + + def _offset(self, curr_ctx): + if self.attn_func == "dense_attn": + return 0 + return (self.sample_t - curr_ctx) % self.block_ctx + + def _pad_to_block_ctx(self, hidden_states, query=False): + seq_len = hidden_states.shape[1] + offset = self._offset(seq_len) if query else 0 + n_blocks = (seq_len + offset + self.block_ctx - 1) // self.block_ctx + pad = n_blocks * self.block_ctx - seq_len - offset + if pad == 0 and offset == 0: + return hidden_states + else: + return F.pad(hidden_states, (0, 0, offset, pad)) + + def _cache_len(self): + return 0 if "key" not in self.cache else self.cache["key"].shape[1] + + def _suff_cache_len(self): + """ + Precondition: + key and value are appended with the current context and self.sample_t reflects the 1-indexed sample + location in the context. + """ + previous_block_length = (self.sample_t - 1) % self.block_ctx + 1 + self.block_ctx + REQUIRED_CACHE_LEN = { + "dense_attn": self.sample_t, + "block_attn": (self.sample_t - 1) % self.block_ctx + 1, + "transpose_block_attn": self.sample_t, + "prev_block_attn": self.sample_t if self.sample_t <= self.block_ctx else previous_block_length, + "cross_attn": self.encoder_len, + "prime_attn": min(self.sample_t, self._encoder_len), + } + + return REQUIRED_CACHE_LEN[self.attn_func] + + def _slice_cache(self, start, end=None): + self.cache["key"] = self.cache["key"][:, start:end] + self.cache["value"] = self.cache["value"][:, start:end] + + def _append_cache(self, key, value): + if "key" not in self.cache: + self.cache["key"] = key + self.cache["value"] = value + else: + old_key, old_value = key, value + key = torch.cat([self.cache["key"], old_key], dim=1) + value = torch.cat([self.cache["value"], old_value], dim=1) + del self.cache["key"] + del self.cache["value"] + del old_key + del old_value + self.cache["key"] = key + self.cache["value"] = value + return self.cache["key"], self.cache["value"] + + def del_cache(self): + self.sample_t = 0 + if "key" in self.cache: + del self.cache["key"] + if "value" in self.cache: + del self.cache["value"] + self.cache = {} + + +class JukeboxBlock(nn.Module): + def __init__(self, config, n_ctx, attn_func="dense_attn"): + super().__init__() + self.width = config.hidden_size + self.attn = JukeboxAttention(config, n_ctx, attn_func=attn_func) + + self.layer_norm_0 = JukeboxLayerNorm(config.hidden_size) + self.mlp = JukeboxMLP(config) + self.layer_norm_1 = JukeboxLayerNorm(config.hidden_size) + self.res_scale = 1.0 / config.num_layers if config.attn_res_scale else 1.0 + self.attn_func = attn_func + + def forward(self, hidden_states, last_encoder_hidden_states, sample=False): + residuals = hidden_states + hidden_states = self.layer_norm_0(hidden_states) + hidden_states = self.attn(hidden_states, last_encoder_hidden_states, sample) + + output_states = self.layer_norm_1(residuals + hidden_states) + output_states = self.mlp(output_states) + if self.res_scale == 1.0: + output = residuals + hidden_states + output_states + else: + output = residuals + self.res_scale * (hidden_states + output_states) + return output + + +class JukeboxLayerStack(nn.Module): + def __init__(self, config, n_ctx): + super().__init__() + self.n_ctx = n_ctx + self.width = config.hidden_size + self.num_layers = config.num_layers + self.blocks = config.blocks + self.attention_pattern = config.attention_pattern + if self.blocks is not None: + self.block_ctx = n_ctx // self.blocks + self.encoder_len = config.nb_relevant_lyric_tokens + self.n_heads = config.n_heads + + # Orders of attn_func + attention_pattern = ATTENTION_PATTERNS[self.attention_pattern] + self._attn_mods = nn.ModuleList() + for depth in range(self.num_layers): + self._attn_mods.append(JukeboxBlock(config, n_ctx, attn_func=attention_pattern(depth))) + + self.saved_attn_weights = [] + + def set_record_attn(self, record_attn): + """ + Makes forward prop dump self-attention softmaxes to self.saved_attn_weights. + + Args: + record_attn (`Union[bool,set]`): + Either a set of layer indices indicating which layers to store, or a boolean value indicating Whether + to dump all. + """ + + def _should_record_attn(layer_idx): + if isinstance(record_attn, bool): + return record_attn + return layer_idx in record_attn + + for i, layer in enumerate(self._attn_mods): + layer.attn.record_attn = _should_record_attn(i) + + if not record_attn: + self.saved_attn_weights = [] + + def forward(self, hidden_states, last_encoder_hidden_states=None, sample=False): + # Blocks + for i, attn_layer in enumerate(self._attn_mods): + if attn_layer.attn_func == "cross_attention": # attend to the lyrics + hidden_states = attn_layer( + hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=sample + ) + else: + hidden_states = attn_layer(hidden_states, last_encoder_hidden_states=None, sample=sample) + if attn_layer.attn.record_attn: + self.saved_attn_weights.append(attn_layer.attn.c_attn.weight) + return hidden_states + + def del_cache(self): + for attn_layer in self._attn_mods: + attn_layer.attn.del_cache() + + +class JukeboxPositionalEmbedding(nn.Module): + def __init__(self, embed_dim, width): + super().__init__() + self.pos_emb = nn.Parameter(torch.empty((embed_dim, width))) + + def forward(self): + pos_emb = self.pos_emb + return pos_emb + + +class JukeboxConditionalAutoregressive(nn.Module): + def __init__( + self, + config, + n_ctx=None, + embed_dim=None, + audio_conditioning=False, + metadata_conditioning=False, + is_encoder=False, + ): + """ + Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern should be properly + set for each configuration. + + Args: + config (`JukeboxPriorConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does + not load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. + n_ctx (`int`, *optional*): + Number of tokens or lyrics tokens provided in a single pass. + embed_dim (`int`, *optional*): + Either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codebook dimension, + if the model combines lyrics and music tokens, or simply n_vocab if the model is a separate encoder + audio_conditioning (`bool`, *optional*, defaults to `False`): + Whether or not the prior supports conditioning on audio. + metadata_conditioning (`bool`, *optional*, defaults to `False`): + Whether or not the prior supports conditioning on artitst, genres, lyrics and timing. + is_encoder (`bool`, *optional*, defaults to `False`): + Whether the model is an encoder only model. + """ + + super().__init__() + self.width = config.hidden_size + self.num_layers = config.num_layers + self.n_ctx = n_ctx if n_ctx is not None else config.n_ctx + self.embed_dim = embed_dim if embed_dim is not None else config.music_vocab_size + self.embed_tokens = nn.Embedding(self.embed_dim, config.hidden_size) + self.embed_tokens_dropout = nn.Dropout(config.emb_dropout) + self.metadata_conditioning = metadata_conditioning + self.audio_conditioning = audio_conditioning + if not metadata_conditioning: + self.start_token = nn.Parameter(torch.empty((1, config.hidden_size))) + self.pos_emb = JukeboxPositionalEmbedding(self.n_ctx, config.hidden_size) + self.pos_emb_dropout = nn.Dropout(config.emb_dropout) + + self.transformer = JukeboxLayerStack(config, n_ctx=self.n_ctx) + self.is_encoder = is_encoder + self.encoder_len = config.nb_relevant_lyric_tokens + + if config.merged_decoder: + # Merged piped model uses this setup + self.add_cond_after_transformer = False + self.share_embed_tokens_fc_proj_out = False + else: + self.add_cond_after_transformer = True + self.share_embed_tokens_fc_proj_out = True + + if not is_encoder: + self.fc_proj_out = nn.Linear(config.hidden_size, self.embed_dim, bias=False) + if self.share_embed_tokens_fc_proj_out: + self.fc_proj_out.weight = self.embed_tokens.weight + self.loss = torch.nn.CrossEntropyLoss() + + def forward( + self, + tokens, + audio_conditioning=None, + metadata_conditioning=None, + last_encoder_hidden_states=None, + get_preds=False, + get_acts=False, + get_sep_loss=False, + ): + """ + Args: + tokens (`torch.tensor`): + Can represent music tokens, lyrics tokens or both, depending on the configuration. + """ + # Preprocess. + batch_size = tokens.shape[0] + with torch.no_grad(): + tokens = tokens.view(batch_size, -1).long() + + if not self.audio_conditioning: + audio_conditioning = torch.zeros( + (batch_size, 1, self.width), + device=tokens.device, + dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype, + ) + + target = tokens # Target + hidden_states = self.embed_tokens(tokens) + # Shift by 1, and fill in start token + hidden_states = torch.cat((hidden_states[:, -1:], hidden_states[:, :-1]), dim=1) + if self.metadata_conditioning: + hidden_states[:, 0] = metadata_conditioning.view(batch_size, self.width) + else: + hidden_states[:, 0] = self.start_token + + hidden_states = ( + self.embed_tokens_dropout(hidden_states) + self.pos_emb_dropout(self.pos_emb()) + audio_conditioning + ) # Pos emb and dropout + + hidden_states = self.transformer( + hidden_states, last_encoder_hidden_states=last_encoder_hidden_states + ) # Transformer + if self.add_cond_after_transformer: # Piped doesn't add x_cond + hidden_states = hidden_states + audio_conditioning + + activations = hidden_states + if self.is_encoder: + return hidden_states + + hidden_states = self.fc_proj_out(hidden_states) # Predictions + loss_fn = nn.CrossEntropyLoss() + if get_sep_loss: + lyric_hidden_states = hidden_states[:, : self.encoder_len].reshape(-1, self.embed_dim) + token_hidden_states = hidden_states[:, self.encoder_len :].reshape(-1, self.embed_dim) + + lyric_loss = loss_fn(lyric_hidden_states, target[:, : self.encoder_len].reshape(-1)) / np.log(2.0) + music_token_loss = loss_fn(token_hidden_states, target[:, self.encoder_len :].reshape(-1)) / np.log(2.0) + + loss = (lyric_loss, music_token_loss) # Note order! Lyric is first + else: + loss = loss_fn(hidden_states.view(-1, self.embed_dim), target.view(-1)) / np.log(2.0) # Loss + + if get_preds: + return loss, hidden_states + elif get_acts: + return loss, activations + else: + return loss, None + + def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning): + if sample_t == 0: + hidden_states = torch.empty(n_samples, 1, self.width, dtype=self.embed_tokens.weight.dtype).to( + self.embed_tokens.weight.device + ) + if self.metadata_conditioning: + hidden_states[:, 0] = metadata_conditioning.view(n_samples, self.width) + else: + hidden_states[:, 0] = self.start_token + else: + hidden_states = self.embed_tokens(tokens) + if audio_conditioning.shape == (n_samples, self.n_ctx, self.width): + cond = audio_conditioning[:, sample_t : sample_t + 1, :] + else: + cond = audio_conditioning + # Pos emb, dropout is identity at eval time + hidden_states = hidden_states + self.pos_emb()[sample_t : sample_t + 1] + cond + return hidden_states, cond + + def sample( + self, + n_samples, + audio_conditioning=None, + metadata_conditioning=None, + last_encoder_hidden_states=None, + temp=1.0, + top_k=0, + top_p=0.0, + get_preds=False, + sample_tokens=None, + ): + if sample_tokens is None: + sample_tokens = self.n_ctx + + if not self.audio_conditioning: + audio_conditioning = torch.zeros( + (n_samples, 1, self.width), dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype + ).to(self.fc_proj_out.device) + + with torch.no_grad(): + sampled_tokens = [] + tokens = None + if get_preds: + preds = [] + + iter = tqdm(range(0, sample_tokens), leave=False) + for sample_t in iter: + iter.set_description(f"Ancestral sampling {sample_tokens} music tokens", refresh=True) + hidden_states, cond = self.get_emb( + sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning + ) + + hidden_states = self.transformer( + hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=True + ) + if self.add_cond_after_transformer: + hidden_states = hidden_states + cond + hidden_states = self.fc_proj_out(hidden_states) # Predictions + if get_preds: + preds.append(hidden_states.clone()) + # Adjust logits + hidden_states = hidden_states / temp + hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p) + # Sample and replace hidden_states + tokens = torch.distributions.Categorical(logits=hidden_states).sample() + sampled_tokens.append(tokens.clone()) + + del tokens + self.transformer.del_cache() + + tokens = torch.cat(sampled_tokens, dim=1) + if get_preds: + preds = torch.cat(preds, dim=1) + if get_preds: + return tokens, preds + else: + return tokens + + def split_chunks(self, length, chunk_size): + n_passes = (length + chunk_size - 1) // chunk_size + chunk_sizes = [*[chunk_size] * (n_passes - 1), (length - 1) % chunk_size + 1] + return chunk_sizes + + def primed_sample( + self, + n_samples, + lyric_and_music_tokens, + audio_conditioning=None, + metadata_conditioning=None, + last_encoder_hidden_states=None, + temp=1.0, + top_k=0, + top_p=0.0, + get_preds=False, + chunk_size=None, + sample_tokens=None, + ): + if sample_tokens is None: + sample_tokens = self.n_ctx + # Preprocess. + batch_size = lyric_and_music_tokens.shape[0] + with torch.no_grad(): + lyric_and_music_tokens = lyric_and_music_tokens.view(batch_size, -1).long() + + sampled_audio = torch.split(lyric_and_music_tokens, 1, dim=1) + sampled_audio = list(sampled_audio) + + if not self.audio_conditioning: + audio_conditioning = torch.zeros( + (n_samples, 1, self.width), dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype + ).to(lyric_and_music_tokens.device) + + with torch.no_grad(): + if get_preds: + preds = [] + + # Fill up key/value cache for past context by running forward pass. + # We do so in chunks instead of doing the whole past in one forward pass to reduce max memory usage. + if chunk_size is None: + chunk_size = len(sampled_audio) + chunk_sizes = self.split_chunks(len(sampled_audio), chunk_size) + x_primes = [] + start = 0 + token = None + + for current_chunk_size in tqdm(chunk_sizes, desc="Preparing past key value", leave=False): + sampled_audio_prime, conds_prime = [], [] + for sample_t in range(start, start + current_chunk_size): + x_prime, cond_prime = self.get_emb( + sample_t, n_samples, token, audio_conditioning, metadata_conditioning + ) + token = sampled_audio[sample_t] + sampled_audio_prime.append(x_prime) + conds_prime.append(cond_prime) + start = start + current_chunk_size + x_prime, cond_prime = torch.cat(sampled_audio_prime, dim=1), torch.cat(conds_prime, dim=1) + del sampled_audio_prime + del conds_prime + if not get_preds: + del cond_prime + x_prime = self.transformer(x_prime, last_encoder_hidden_states=last_encoder_hidden_states, sample=True) + + if get_preds: + if self.add_cond_after_transformer: + x_prime = x_prime + cond_prime + del cond_prime + x_primes.append(x_prime) + else: + del x_prime + + if get_preds: + x_prime = torch.cat(x_primes, dim=1) + x_prime = self.fc_proj_out(x_prime) # Predictions + preds.append(x_prime) + + # the input of the encoder and decoder can be merged into (lyrics, music tokens) + input_tokens = sampled_audio[-1] + + itererator = tqdm( + range(len(sampled_audio), sample_tokens), + desc=f"Sampling {len(range(len(sampled_audio), sample_tokens))} music tokens", + leave=False, + ) + for sample_t in itererator: + hidden_states, cond = self.get_emb( + sample_t, n_samples, input_tokens, audio_conditioning, metadata_conditioning + ) + + hidden_states = self.transformer( + hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=True + ) + if self.add_cond_after_transformer: + hidden_states = hidden_states + cond + hidden_states = self.fc_proj_out(hidden_states) # Predictions + if get_preds: + preds.append(hidden_states) + # Adjust logits + hidden_states = hidden_states / temp + hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p) + # only music tokens are sampled + music_tokens = torch.distributions.Categorical(logits=hidden_states).sample() + sampled_audio.append(music_tokens.clone()) + input_tokens = music_tokens + + del input_tokens, music_tokens + self.transformer.del_cache() + + music_tokens = torch.cat(sampled_audio, dim=1) + if get_preds: + preds = torch.cat(preds, dim=1) + if get_preds: + return music_tokens, preds + else: + return music_tokens + + +class JukeboxMusicTokenConditioner(nn.Module): + """ + The `JukeboxMusicTokenConditioner` takes music tokens as an input (corresponding to the codes of the VQVAE's + codebook) and upsamples it using a single layer of decoder convolution block (the same is used in the VQVAE). + """ + + def __init__(self, config, level): + super().__init__() + self.embed_tokens = nn.Embedding(config.music_vocab_size, config.hidden_size) + config.embed_dim = config.music_vocab_size # setting correct argument for the `JukeboxDecoder` + + self.upsampler = JukeboxDecoderConvBock( + config, + config.hidden_size, + config.res_conv_width, + config.res_conv_depth, + config.res_downs_t[level], + config.res_strides_t[level], + reverse_dilation=False, + ) + self.layer_norm = JukeboxLayerNorm(config.hidden_size) + + def forward(self, music_tokens, raw_audio_conditioning=None): + """ + Args: + music_tokens (`torch.LongTensor`): + Music tokens form the upper level in range(nb_discrete_codes) + raw_audio_conditioning (`torch.LongTensor`, *optional*): + Audio used when primed sampling, raw audio information that conditions the generation + """ + if raw_audio_conditioning is None: + raw_audio_conditioning = 0.0 + # Embed music_tokens + music_tokens = music_tokens.long() + hidden_states = self.embed_tokens(music_tokens) + hidden_states = hidden_states + raw_audio_conditioning + + # Run conditioner + hidden_states = hidden_states.permute(0, 2, 1) + hidden_states = self.upsampler(hidden_states) + hidden_states = hidden_states.permute(0, 2, 1) + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class JukeboxRangeEmbedding(nn.Module): + """ + The `JukeboxRangeEmbedding` interpolate the given [pos_start, pos_end] to obtain an equivalent of time positional + embedding of length `n_ctx`. + + Binning process : For each pos in position tensor, find its bin [start,end) mapped to [0,1,...,bins-1] [start,end) + -> [0,1) -> [0, bins) -> floor -> [0,...,bins-1] NOTE: Open ended interval on right, so start <= pos < end, not <= + end + """ + + def __init__(self, n_time, embed_dim, range, out_width, clamp=False): + super().__init__() + self.n_time = n_time + self.embed_dim = embed_dim + self.emb = nn.Embedding(embed_dim, out_width) + self.pos_min, self.pos_max = range + self.clamp = clamp + + def forward(self, pos_start, pos_end=None): + # Check if [pos_start,pos_end] in [pos_min, pos_max) + if not len(pos_start.shape) == 2: + raise TypeError(f"Expected shape with 2 dims, got {pos_start.shape}") + if not (self.pos_min <= pos_start).all() and (pos_start < self.pos_max).all(): + raise TypeError(f"Range is [{self.pos_min},{self.pos_max}), got {pos_start}") + + pos_start = pos_start.float() + if pos_end is not None: + if self.clamp: + pos_end = pos_end.clamp(self.pos_min, self.pos_max) + + pos_end = pos_end.float() + # Interpolate so that [pos_start, ..., pos_end] <-> position tensor of length n_ctx + n_time = self.n_time + if n_time != 1: + interpolation = ( + torch.arange(0, n_time, dtype=torch.float, device=pos_start.device).view(1, n_time) / n_time + ) + position = pos_start + (pos_end - pos_start) * interpolation + else: + position = pos_start + + # Bin each value to bins_ + # [0,1) -> [0,1..,embed_dim) -> [0,1...,embed_dim-1 + normalised_position = (position - self.pos_min) / (self.pos_max - self.pos_min) + bins_ = (self.embed_dim * normalised_position).floor().long().detach() + return self.emb(bins_) + + +class JukeboxLabelConditioner(nn.Module): + def __init__(self, config, include_time_signal): + super().__init__() + + embed_dim = config.hidden_size + timing_dims = config.timing_dims + sampling_rate = config.sampling_rate + nb_genres, nb_artists = config.metadata_dims + music_tokens_shape = config.n_ctx + + self.max_nb_genres = config.max_nb_genres + self.bow_genre_emb = nn.Embedding(nb_genres, embed_dim) + self.artist_emb = nn.Embedding(nb_artists, embed_dim) + self.include_time_signal = include_time_signal + if self.include_time_signal: + total_length_range = (config.min_duration * sampling_rate, config.max_duration * sampling_rate) + absolute_pos_range = (0.0, config.max_duration * sampling_rate) + relative_pos_range = (0.0, 1.0) + self.total_length_emb = JukeboxRangeEmbedding(1, timing_dims, total_length_range, embed_dim) + self.absolute_pos_emb = JukeboxRangeEmbedding( + music_tokens_shape, timing_dims, absolute_pos_range, embed_dim + ) + self.relative_pos_emb = JukeboxRangeEmbedding( + music_tokens_shape, timing_dims, relative_pos_range, embed_dim, clamp=True + ) + + def forward(self, metadata): + total_length = metadata[:, 0:1] + offset = metadata[:, 1:2] + length = metadata[:, 2:3] + artist = metadata[:, 3:4] + genre = metadata[:, 4:] + + # Start embedding of length 1 + artist_emb = self.artist_emb(artist) + # Empty genre slots are denoted by -1. We mask these out. + mask = (genre >= 0).float().unsqueeze(2) + genre_emb = (self.bow_genre_emb(genre.clamp(0)) * mask).sum(dim=1, keepdim=True) + start_emb = genre_emb + artist_emb + + # Pos embedding of length n_ctx + if self.include_time_signal: + start, end = offset, offset + length + total_length = total_length.float() + start = start.float() + end = end.float() + pos_emb = ( + self.total_length_emb(total_length) + + self.absolute_pos_emb(start, end) + + self.relative_pos_emb(start / total_length, end / total_length) + ) + else: + pos_emb = None + return start_emb, pos_emb + + +class JukeboxPrior(PreTrainedModel): + """ + The JukeboxPrior class, which is a wrapper around the various conditioning and the transformer. JukeboxPrior can be + seen as language models trained on music. They model the next `music token` prediction task. If a (lyric) `encoderù + is defined, it also models the `next character` prediction on the lyrics. Can be conditioned on timing, artist, + genre, lyrics and codes from lower-levels Priors. + + Args: + config (`JukeboxPriorConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. + level (`int`, *optional*): + Current level of the Prior. Should be in range `[0,nb_priors]`. + nb_priors (`int`, *optional*, defaults to 3): + Total number of priors. + vqvae_encoder (`Callable`, *optional*): + Encoding method of the VQVAE encoder used in the forward pass of the model. Passing functions instead of + the vqvae module to avoid getting the parameters. + vqvae_decoder (`Callable`, *optional*): + Decoding method of the VQVAE decoder used in the forward pass of the model. Passing functions instead of + the vqvae module to avoid getting the parameters. + """ + + config: JukeboxPriorConfig + + def _init_weights(self, module): + init_scale = self.config.init_scale + + if isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=0.02 * init_scale) + elif isinstance(module, JukeboxConv1D): + if self.config.zero_out: + module.weight.data.zero_() + else: + module.weight.data.normal_(mean=0.0, std=0.02 * init_scale) + elif isinstance(module, JukeboxPositionalEmbedding): + module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale) + elif isinstance(module, JukeboxRangeEmbedding): + module.emb.weight.data.normal_(mean=0.0, std=0.01 * init_scale) + elif isinstance(module, JukeboxConditionalAutoregressive) and hasattr(module, "lm_head"): + module.lm_head.weight.data.normal_(mean=0.0, std=0.02 * init_scale) + elif isinstance(module, JukeboxConditionalAutoregressive) and hasattr(module, "start_token"): + module.start_token.data.normal_(mean=0.0, std=0.01 * init_scale) + elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out: + module.conv1d_2.weight.data.zero_() + module.conv1d_2.bias.data.zero_() + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def __init__(self, config: JukeboxPriorConfig, level=None, nb_priors=3, vqvae_encoder=None, vqvae_decoder=None): + super().__init__(config) + # Passing functions instead of the vqvae module to avoid getting params, only used in the + # forward loop + self.vqvae_encoder = vqvae_encoder + self.vqvae_decoder = vqvae_decoder + + self.levels = nb_priors + self.level = level if level is not None else config.level + + self.base_model_prefix = f"priors.{self.level}" + + self.n_ctx = config.n_ctx + + self.lyric_conditioning = config.nb_relevant_lyric_tokens > 0 + self.nb_relevant_lyric_tokens = config.nb_relevant_lyric_tokens + self.encoder_loss_fraction = config.encoder_loss_fraction + + # Audio conditioning : conditioning on music tokens (either from audio or from previous levels or both) + self.audio_conditioning = self.level != 0 + self.cond_level = self.level - 1 + if self.audio_conditioning: + self.conditioner_blocks = JukeboxMusicTokenConditioner(config, self.level) + + # metadata conditioning : contioning on timing, genres, and artist + self.metadata_conditioning = config.metadata_conditioning + if self.metadata_conditioning: + self.metadata_embedding = JukeboxLabelConditioner(config, include_time_signal=not self.audio_conditioning) + + # define encoder-decoder or encoder and decoder + self.is_encoder_decoder = config.is_encoder_decoder + if config.is_encoder_decoder: + # encoder-decoder transformer + self.input_shapes = [config.nb_relevant_lyric_tokens, config.n_ctx] + self.embed_dim_shift = [0, config.lyric_vocab_size] + self.width = config.hidden_size + + self.nb_relevant_lyric_tokens = config.nb_relevant_lyric_tokens + + self.prior = JukeboxConditionalAutoregressive( + config, + n_ctx=config.nb_relevant_lyric_tokens + config.n_ctx, + embed_dim=config.lyric_vocab_size + config.music_vocab_size, + audio_conditioning=(self.audio_conditioning or self.metadata_conditioning), + metadata_conditioning=True, + ) + + else: + # Separate encoder-decoder transformer + encoder_config = config.encoder_config + + if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning: + self.lyric_acts_width = encoder_config.hidden_size + self.encoder_width = config.hidden_size + self.encoder_dim = config.lyric_vocab_size + self.encoder = JukeboxConditionalAutoregressive( + encoder_config, + n_ctx=self.nb_relevant_lyric_tokens, + embed_dim=self.encoder_dim, + audio_conditioning=False, + metadata_conditioning=False, + is_encoder=True, + ) + self.encoder.proj_in = JukeboxConv1D(encoder_config.hidden_size, config.hidden_size) + self.encoder.final_layer_norm = JukeboxLayerNorm(config.hidden_size) + self.encoder.lm_head = nn.Linear(config.hidden_size, config.lyric_vocab_size, bias=False) + else: + self.nb_relevant_lyric_tokens = 0 + + # decoder model on the tokens + self.prior = JukeboxConditionalAutoregressive( + config, + audio_conditioning=(self.audio_conditioning or self.metadata_conditioning), + metadata_conditioning=self.metadata_conditioning, + ) + + self.next_token_prediction_loss_dims = config.n_ctx + self.total_loss_dims = self.nb_relevant_lyric_tokens + self.next_token_prediction_loss_dims + + self.downsamples = [stride**down for stride, down in zip(config.res_strides_t, config.res_downs_t)] + self.cond_downsample = self.downsamples[self.level] if self.level != 0 else None + self.raw_to_tokens = np.prod(self.downsamples[: nb_priors - self.level]) + self.sample_length = self.n_ctx * self.raw_to_tokens + + logger.info( + f"Level:{self.level}, Cond downsample:{self.cond_downsample}, Raw to tokens:{self.raw_to_tokens}, Sample" + f" length:{self.sample_length}" + ) + + def get_metadata(self, labels, start, total_length, offset, get_indices=False): + metadata = labels.clone() + metadata[:, 0] = total_length + # Set sample_length to match this level + metadata[:, 2] = int(self.sample_length) + + # Set offset + metadata[:, 1:2] = int(offset * self.raw_to_tokens) + int(start * self.raw_to_tokens) + # here since metadata has the full token_list, we just need to selected the ones that are relevant + + # Set lyric tokens + metadata, indices = self.set_metadata_lyric_tokens(metadata) + if get_indices: + return metadata, indices + else: + return metadata + + def set_metadata_lyric_tokens(self, labels): + """ + Processes the full labels to only retrieve the relevant lyric tokens and keep the metadata conditioning tokens. + """ + if self.nb_relevant_lyric_tokens > 0: + tokens_list = torch.zeros( + (labels.shape[0], self.nb_relevant_lyric_tokens), dtype=torch.long, device=labels.device + ) + indices_list = [] # what's the index of each current character in original array + for idx in range(labels.shape[0]): + full_tokens = labels.clone()[:, 4 + self.metadata_embedding.max_nb_genres :] + total_length, offset, duration = labels[idx, 0], labels[idx, 1], labels[idx, 2] + tokens, indices = get_relevant_lyric_tokens( + full_tokens, self.nb_relevant_lyric_tokens, total_length, offset, duration + ) + tokens_list[idx, :] = tokens + indices_list.append(indices) + + return ( + torch.cat((labels[:, : 4 + self.metadata_embedding.max_nb_genres], tokens_list), dim=-1), + indices_list, + ) + else: + return labels, None + + def get_music_tokens_conds(self, music_tokens, start, end): + """ + Extracts current level's conditioning music tokens. + """ + if self.level != 0: + music_tokens_cond = music_tokens[self.level - 1] + music_tokens = music_tokens_cond[:, start // self.cond_downsample : end // self.cond_downsample] + missing_cond_len = self.n_ctx // self.cond_downsample - music_tokens_cond[-1].shape[-1] + if missing_cond_len > 0: + init_cond = torch.zeros(1, missing_cond_len).to(music_tokens_cond.device) + music_tokens_cond = torch.cat((music_tokens_cond, init_cond), dim=-1).long() + music_tokens_conds = [music_tokens_cond] + else: + music_tokens_conds = None + return music_tokens_conds + + def prior_preprocess(self, tokens, conds): + """ + Shifts the input tokens to account for the dictionary merge. The embed_dim_shift give by how much the music + tokens should be shifted by. It is equal to `lyric_vocab_size`. + """ + batch_size = tokens[0].shape[0] + for i in range(len(tokens)): + tokens[i] = (tokens[i] + int(self.embed_dim_shift[i])).view(batch_size, -1) + + for i in range(len(conds)): + if conds[i] is None: + conds[i] = torch.zeros( + (batch_size, self.input_shapes[i], self.width), dtype=tokens[0].dtype, device=tokens[0].device + ) + + return torch.cat(tokens, dim=1), torch.cat(conds, dim=1) + + def prior_postprocess(self, tokens): + """ + Shifts back the input tokens if the model uses an encoder decoder architecture. As the embedding layer is + shared, `prior_embed_dim_shift` shifts the music token ids by `lyric_vocab_size`. Only returns the music + tokens. + """ + batch_size = tokens.shape[0] + dims = (self.input_shapes[0], tokens.shape[1] - self.input_shapes[0]) + tokens = list(torch.split(tokens, dims, dim=1)) + + # Some of the input tokens might be shifted to take into account the voccabulary fusion + for i in range(len(tokens)): + bins_shift = int(self.embed_dim_shift[i]) + tokens[i] = (tokens[i] - bins_shift).view(batch_size, -1) + tokens[i] = torch.clamp(tokens[i], min=0) + # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift + return tokens[-1] + + def embed_tokens(self, music_tokens_conds): + """ + Embeds the upper level music tokens and upsamples them to provide as audio conditioning. + """ + music_tokens_conds = music_tokens_conds[: self.cond_level + 1] + audio_conditioning = None + for music_tokens_cond, conditioner_block in reversed(list(zip(music_tokens_conds, [self.conditioner_blocks]))): + audio_conditioning = conditioner_block(music_tokens_cond, audio_conditioning) + return audio_conditioning + + def encode(self, hidden_states, start_level=None, end_level=None, bs_chunks=1): + """ + Encodes the hidden states (raw audio) using the VQVAE's encoder. Returns latent_states. + """ + if start_level is None: + start_level = self.level + if end_level is None: + end_level = self.levels + # Get latents + with torch.no_grad(): + latent_states = self.vqvae_encoder( + hidden_states, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks + ) + return latent_states + + def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1): + """ + Usamples the sequence of codebook vectors to a raw audio. + """ + if start_level is None: + start_level = self.level + if end_level is None: + end_level = self.levels + with torch.no_grad(): + output = self.vqvae_decoder( + music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks + ) + return output + + def get_cond(self, music_tokens_conds, metadata): + """ + Converts the input tokens to input_embeddings. Splits the lyrics form the rest of the metadata. Lyric tokens + can be None. + """ + if metadata is not None: + n_labels = metadata.shape[1] - self.nb_relevant_lyric_tokens + metadata, lyric_tokens = metadata[:, :n_labels], metadata[:, n_labels:] + else: + metadata, lyric_tokens = None, None + metadata_conditioning, metadata_pos = ( + self.metadata_embedding(metadata) if self.metadata_conditioning else (None, None) + ) + audio_conditioning = self.embed_tokens(music_tokens_conds) if self.audio_conditioning else metadata_pos + return audio_conditioning, metadata_conditioning, lyric_tokens + + def sample( + self, + n_samples, + music_tokens=None, + music_tokens_conds=None, + metadata=None, + temp=1.0, + top_k=0, + top_p=0.0, + chunk_size=None, + sample_tokens=None, + ): + """ + Ancestral/Prime sampling a window of tokens using the provided conditioning and metadatas. + + Args: + n_samples (`int`): + Number of samples to generate. + music_tokens (`list[torch.LongTensor]`, *optional*): + Previously generated tokens at the current level. Used as context for the generation. + music_tokens_conds (`list[torch.FloatTensor]`, *optional*): + Upper-level music tokens generated by the previous prior model. Is `None` if the generation is not + conditioned on the upper-level tokens. + metadata (`list[torch.LongTensor]`, *optional*): + List containing the metadata tensor with the artist, genre and the lyric tokens. + temp (`float`, *optional*, defaults to 1.0): + Sampling temperature. + top_k (`int`, *optional*, defaults to 0): + Top k probabilities used for filtering. + top_p (`float`, *optional*, defaults to 0.0): + Top p probabilities used for filtering. + chunk_size (`int`, *optional*): + Size of the chunks used to prepare the cache of the transformer. + sample_tokens (`int`, *optional*): + Number of tokens to sample. + + """ + no_past_context = music_tokens is None or music_tokens.shape[1] == 0 + name = {True: "Ancestral", False: "Primed"}[no_past_context] + logger.info(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}") + + with torch.no_grad(): + # Currently audio_conditioning only uses immediately above layer + audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata) + if self.is_encoder_decoder: + if no_past_context: # the prime_sample function will be used with music_tokens set to None + lyric_and_music_tokens, audio_conditioning = self.prior_preprocess( + [lyric_tokens], [None, audio_conditioning] + ) + else: + lyric_and_music_tokens, audio_conditioning = self.prior_preprocess( + [lyric_tokens, music_tokens], [None, audio_conditioning] + ) + if sample_tokens is not None: + sample_tokens += self.nb_relevant_lyric_tokens + music_tokens = self.prior.primed_sample( + n_samples, + lyric_and_music_tokens, + audio_conditioning, + metadata_conditioning, + temp=temp, + top_k=top_k, + top_p=top_p, + chunk_size=chunk_size, + sample_tokens=sample_tokens, + ) + music_tokens = self.prior_postprocess(music_tokens) + else: + last_encoder_hidden_states = self.get_encoder_states(lyric_tokens, sample=True) + if no_past_context: + music_tokens = self.prior.sample( + n_samples, + audio_conditioning, + metadata_conditioning, + last_encoder_hidden_states, + temp=temp, + top_k=top_k, + top_p=top_p, + sample_tokens=sample_tokens, + ) + else: + music_tokens = self.prior.primed_sample( + n_samples, + music_tokens, + audio_conditioning, + metadata_conditioning, + last_encoder_hidden_states, + temp=temp, + top_k=top_k, + top_p=top_p, + chunk_size=chunk_size, + sample_tokens=sample_tokens, + ) + return music_tokens + + def get_encoder_states(self, lyric_tokens, sample=False): + """ + Retrieve the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through + the lyric encoder. + """ + if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning: + if sample: + self.encoder = self.encoder.to(lyric_tokens.device) + lyric_acts = self.encoder(lyric_tokens, None, None, None) + lyric_acts = self.encoder.proj_in(lyric_acts) + last_encoder_hidden_states = self.encoder.final_layer_norm(lyric_acts) + else: + last_encoder_hidden_states = None + return last_encoder_hidden_states + + def get_encoder_loss(self, last_encoder_hidden_states, target_lyrics): + """ + Computes the loss for the lyric encoder: next lyric token prediction. + """ + if self.lyric_conditioning: + last_encoder_hidden_states = self.encoder.lm_head(last_encoder_hidden_states) + encoder_loss = nn.functional.cross_entropy( + last_encoder_hidden_states.view(-1, self.encoder_dim), target_lyrics.view(-1) + ) / np.log(2.0) + else: + encoder_loss = torch.tensor(0.0, device=last_encoder_hidden_states.device) + return encoder_loss + + def forward_tokens( + self, music_tokens, music_tokens_conds=[], metadata=None, get_preds=False, get_attn_weights=False + ): + """ + Applies a forward pass using the conditioning tokens. Different from the classic forward as it does not use the + vqvae's encoding layers. + """ + if get_attn_weights: + self.prior.transformer.set_record_attn(get_attn_weights) + audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata) + + if self.is_encoder_decoder: # the preprocess returns the full tokens (Lyrics and Music tokens), shifted + tokens, audio_conditioning = self.prior_preprocess( + [lyric_tokens, music_tokens], [None, audio_conditioning] + ) + (encoder_loss, next_token_prediction_loss), preds = self.prior( + tokens, audio_conditioning, metadata_conditioning, get_sep_loss=True, get_preds=get_preds + ) + else: + last_encoder_hidden_states = self.get_encoder_states(lyric_tokens) + encoder_loss = self.get_encoder_loss(last_encoder_hidden_states, lyric_tokens) + next_token_prediction_loss, preds = self.prior( + music_tokens, + audio_conditioning, + metadata_conditioning, + last_encoder_hidden_states, + get_preds=get_preds, + ) + loss = self.encoder_loss_fraction * encoder_loss * self.nb_relevant_lyric_tokens / self.total_loss_dims + loss += next_token_prediction_loss * self.next_token_prediction_loss_dims / self.total_loss_dims + + metrics = { + "bpd": next_token_prediction_loss.detach().clone(), + "encoder_loss": encoder_loss.detach().clone(), + "next_token_prediction_loss": next_token_prediction_loss.detach().clone(), + } + if get_preds: + metrics["preds"] = preds.detach().clone() + if get_attn_weights: + saved_attn_weights = self.prior.transformer.saved_attn_weights + self.prior.transformer.set_record_attn(False) + return saved_attn_weights + else: + return loss, metrics + + def forward( + self, + hidden_states: torch.Tensor, + metadata: Optional[list[torch.LongTensor]], + decode: Optional[bool] = False, + get_preds: Optional[bool] = False, + ) -> list[torch.Tensor]: + """ + Encode the hidden states using the `vqvae` encoder, and then predicts the next token in the `forward_tokens` + function. The loss is the sum of the `encoder` loss and the `decoder` loss. + + Args: + hidden_states (`torch.Tensor`): + Hidden states which should be raw audio + metadata (`list[torch.LongTensor]`, *optional*): + List containing the metadata conditioning tensor with the lyric and the metadata tokens. + decode (`bool`, *optional*, defaults to `False`): + Whether or not to decode the encoded to tokens. + get_preds (`bool`, *optional*, defaults to `False`): + Whether or not to return the actual predictions of the model. + """ + batch_size = hidden_states.shape[0] + music_tokens, *music_tokens_conds = self.encode(hidden_states, bs_chunks=batch_size) + loss, metrics = self.forward_tokens( + music_tokens=music_tokens, + music_tokens_conds=music_tokens_conds, + metadata=metadata, + get_preds=get_preds, + ) + if decode: + dequantised_states = self.decode([music_tokens, *music_tokens_conds]) + else: + dequantised_states = None + return dequantised_states, loss, metrics + + +class JukeboxPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: JukeboxConfig + base_model_prefix = "jukebox" + supports_gradient_checkpointing = False + + def _init_weights(self, module): + if isinstance(module, (JukeboxPrior, JukeboxVQVAE)): + module.apply(module._init_weights) + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + +JUKEBOX_SAMPLING_INPUT_DOCSTRING = r""" + labels (`list[torch.LongTensor]` of length `n_sample`, and shape `(self.levels, self.config.max_nb_genre + lyric_sequence_length)` : + List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to + condition the generation. + sampling_kwargs (`dict[Any]`): + Various additional sampling arguments that are used by the `_sample` function. A detail list of the + arguments can bee seen in the [`_sample`] function documentation. +""" + + +@add_start_docstrings( + """The bare JUKEBOX Model used for music generation. 4 sampling techniques are supported : `primed_sample`, `upsample`, + `continue_sample` and `ancestral_sample`. It does not have a `forward` method as the training is not end to end. If + you want to fine-tune the model, it is recommended to use the `JukeboxPrior` class and train each prior + individually. + """, + JUKEBOX_START_DOCSTRING, +) +class JukeboxModel(JukeboxPreTrainedModel): + _no_split_modules = ["JukeboxBlock"] + + def __init__(self, config): + super().__init__(config) + vqvae_config = config.vqvae_config + self.vqvae = JukeboxVQVAE(vqvae_config) + self.set_shared_params(config) + self.priors = nn.ModuleList( + [JukeboxPrior(config.prior_configs[level], level) for level in range(config.nb_priors)] + ) + + def set_shared_params(self, model_config): + """ + Initialises the parameters that are shared. This has to be done here because the list of `JukeboxPriorConfig` + is nest, and is thus unreachable in the `from_dict` function + """ + for config in model_config.prior_configs: + config.sampling_rate = model_config.sampling_rate + config.timing_dims = model_config.timing_dims + config.min_duration = model_config.min_duration + config.max_duration = model_config.max_duration + config.max_nb_genres = model_config.max_nb_genres + config.metadata_conditioning = model_config.metadata_conditioning + + def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1): + return self.vqvae.decode(music_tokens, start_level, end_level, bs_chunks) + + def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1): + return self.vqvae.encode(input_audio, start_level, end_level, bs_chunks) + + def split_batch(self, obj, n_samples, split_size): + n_passes = (n_samples + split_size - 1) // split_size + if isinstance(obj, torch.Tensor): + return torch.split(obj, split_size, dim=0) + elif isinstance(obj, list): + return list(zip(*[torch.split(item, split_size, dim=0) for item in obj])) + elif obj is None: + return [None] * n_passes + else: + raise TypeError("Unknown input type") + + # Sample a partial window of length= self.priors[level].n_ctx: + iterator = get_starts(total_length, self.priors[level].n_ctx, hop_length) + for start in iterator: + music_tokens = self.sample_single_window( + music_tokens, labels, offset, sampling_kwargs, level, start, max_batch_size + ) + + else: + music_tokens = self.sample_partial_window( + music_tokens, labels, offset, sampling_kwargs, level, total_length, max_batch_size + ) + return music_tokens + + @torch.no_grad() + def _sample( + self, + music_tokens, + labels, + sample_levels, + metas=None, + chunk_size=32, + sampling_temperature=0.98, + lower_batch_size=16, + max_batch_size=16, + sample_length_in_seconds=24, + compute_alignments=False, + sample_tokens=None, + offset=0, + save_results=True, + sample_length=None, + ) -> list[torch.LongTensor]: + """ + Core sampling function used to generate music tokens. Iterates over the provided list of levels, while saving + the generated raw audio at each step. + + Args: + music_tokens (`list[torch.LongTensor]`): + A sequence of music tokens of length `self.levels` which will be used as context to continue the + sampling process. Should have `self.levels` tensors, each corresponding to the generation at a certain + level. + labels (`list[torch.LongTensor]`): + List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre + + lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens + which are used to condition the generation. + sample_levels (`list[int]`): + List of the desired levels at which the sampling will be done. A level is equivalent to the index of + the prior in the list of priors + metas (`list[Any]`, *optional*): + Metadatas used to generate the `labels` + chunk_size (`int`, *optional*, defaults to 32): + Size of a chunk of audio, used to fill up the memory in chunks to prevent OOM errors. Bigger chunks + means faster memory filling but more consumption. + sampling_temperature (`float`, *optional*, defaults to 0.98): + Temperature used to adjust the randomness of the sampling. + lower_batch_size (`int`, *optional*, defaults to 16): + Maximum batch size for the lower level priors + max_batch_size (`int`, *optional*, defaults to 16): + Maximum batch size for the top level priors + sample_length_in_seconds (`int`, *optional*, defaults to 24): + Desired length of the generation in seconds + compute_alignments (`bool`, *optional*, defaults to `False`): + Whether or not to compute the alignment between the lyrics and the audio using the top_prior + sample_tokens (`int`, *optional*): + Precise number of tokens that should be sampled at each level. This is mostly useful for running dummy + experiments + offset (`int`, *optional*, defaults to 0): + Audio offset used as conditioning, corresponds to the starting sample in the music. If the offset is + greater than 0, the lyrics will be shifted take that intoaccount + save_results (`bool`, *optional*, defaults to `True`): + Whether or not to save the intermediate results. If `True`, will generate a folder named with the start + time. + sample_length (`int`, *optional*): + Desired length of the generation in samples. + + Returns: torch.Tensor + + Example: + + ```python + >>> from transformers import AutoTokenizer, JukeboxModel, set_seed + >>> import torch + + >>> metas = dict(artist="Zac Brown Band", genres="Country", lyrics="I met a traveller from an antique land") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/jukebox-1b-lyrics") + >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval() + + >>> labels = tokenizer(**metas)["input_ids"] + >>> set_seed(0) + >>> zs = [torch.zeros(1, 0, dtype=torch.long) for _ in range(3)] + >>> zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False) + >>> zs[0] + tensor([[1853, 1369, 1150, 1869, 1379, 1789, 519, 710, 1306, 1100, 1229, 519, + 353, 1306, 1379, 1053, 519, 653, 1631, 1467, 1229, 1229, 10, 1647, + 1254, 1229, 1306, 1528, 1789, 216, 1631, 1434, 653, 475, 1150, 1528, + 1804, 541, 1804, 1434]]) + ``` + """ + + top_prior = self.priors[0] + if sample_length is not None: + total_length = sample_length + else: + total_length = ( + int(sample_length_in_seconds * self.config.sampling_rate) // top_prior.raw_to_tokens + ) * top_prior.raw_to_tokens + + if sample_levels is None: + sample_levels = range(len(self.priors)) + + # total length of the signal, might be bit different from the actual generated length + self.total_length = total_length + for level in sample_levels: + sampling_kwargs = { + "temp": 0.99 if level == len(self.priors) - 1 else sampling_temperature, + "chunk_size": chunk_size, + "sample_tokens": sample_tokens, + } + # Set correct total_length, hop_length, labels and sampling_kwargs for level + + total_token_to_sample = total_length // self.priors[level].raw_to_tokens + hop_length = int(self.config.hop_fraction[level] * self.priors[level].n_ctx) + max_batch_size = lower_batch_size if level != sample_levels else max_batch_size + music_tokens = self.sample_level( + music_tokens, + labels[level], + offset, + sampling_kwargs, + level, + total_token_to_sample, + hop_length, + max_batch_size, + ) + + if save_results: + self.vqvae.to(music_tokens[level].device) + # Decode sample + with torch.no_grad(): + start_level = len(self.priors) - level - 1 # vqvae levels are reversed + raw_audio = self.vqvae.decode( + music_tokens[: level + 1], start_level=start_level, bs_chunks=music_tokens[level].shape[0] + ) + logdir = f"jukebox/level_{level}" + if not os.path.exists(logdir): + os.makedirs(logdir) + save_temp_audio(logdir, level, metas=metas, aud=raw_audio.float()) + if compute_alignments and self.priors[0] is not None and self.priors[0].nb_relevant_lyric_tokens > 0: + with torch.no_grad(): + alignments = get_alignment(music_tokens, labels[0], self.priors[0], self.config) + torch.save({"alignments": alignments}, f"{logdir}/lyric_alignments.pt") + + return music_tokens + + @add_start_docstrings( + """ + Generates music tokens based on the provided `labels. Will start at the desired prior level and automatically + upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use + the VQ-VAE decoder to convert the music tokens to raw audio. + + Args: + labels (`list[torch.LongTensor]`) : + List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre + + lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens + which are used to condition the generation. + n_samples (`int`, *optional*, default to 1) : + Number of samples to be generated in parallel. + """, + ) + def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> list[torch.LongTensor]: + """ + Example: + + ```python + >>> from transformers import AutoTokenizer, JukeboxModel, set_seed + + >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval() + >>> tokenizer = AutoTokenizer.from_pretrained("openai/jukebox-1b-lyrics") + + >>> lyrics = "Hey, are you awake? Can you talk to me?" + >>> artist = "Zac Brown Band" + >>> genre = "Country" + >>> metas = tokenizer(artist=artist, genres=genre, lyrics=lyrics) + >>> set_seed(0) + >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length=400) + + >>> with torch.no_grad(): + ... model.decode(music_tokens)[:, :10].squeeze(-1) + tensor([[-0.0219, -0.0679, -0.1050, -0.1203, -0.1271, -0.0936, -0.0396, -0.0405, + -0.0818, -0.0697]]) + ``` + """ + + sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors)))) + music_tokens = [ + torch.zeros(n_samples, 0, dtype=torch.long, device=labels[0].device) for _ in range(len(self.priors)) + ] + music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs) + return music_tokens + + @add_start_docstrings( + """Generates a continuation of the previously generated tokens. + + Args: + music_tokens (`list[torch.LongTensor]` of length `self.levels` ) : + A sequence of music tokens which will be used as context to continue the sampling process. Should have + `self.levels` tensors, each corresponding to the generation at a certain level. + """, + JUKEBOX_SAMPLING_INPUT_DOCSTRING, + ) + def continue_sample(self, music_tokens, labels, **sampling_kwargs) -> list[torch.LongTensor]: + sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors)))) + music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs) + return music_tokens + + @add_start_docstrings( + """Upsamples a sequence of music tokens using the prior at level `level`. + + Args: + music_tokens (`list[torch.LongTensor]` of length `self.levels` ) : + A sequence of music tokens which will be used as context to continue the sampling process. Should have + `self.levels` tensors, each corresponding to the generation at a certain level. + """, + JUKEBOX_SAMPLING_INPUT_DOCSTRING, + ) + def upsample(self, music_tokens, labels, **sampling_kwargs) -> list[torch.LongTensor]: + sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors) - 1))) + music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs) + return music_tokens + + @add_start_docstrings( + """Generate a raw audio conditioned on the provided `raw_audio` which is used as conditioning at each of the + generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are + used: as conditioning for each level, which means that no ancestral sampling is required. + + Args: + raw_audio (`list[torch.Tensor]` of length `n_samples` ) : + A list of raw audio that will be used as conditioning information for each samples that will be + generated. + """, + JUKEBOX_SAMPLING_INPUT_DOCSTRING, + ) + def primed_sample(self, raw_audio, labels, **sampling_kwargs) -> list[torch.LongTensor]: + sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors)))) + self.vqvae.to(raw_audio.device).float() + with torch.no_grad(): + music_tokens = self.vqvae.encode( + raw_audio, start_level=0, end_level=len(self.priors), bs_chunks=raw_audio.shape[0] + ) + music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs) + return music_tokens + + +__all__ = ["JukeboxModel", "JukeboxPreTrainedModel", "JukeboxVQVAE", "JukeboxPrior"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/tokenization_jukebox.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/tokenization_jukebox.py new file mode 100644 index 0000000000000000000000000000000000000000..ec2162db2cce52876fee37d9806c89b4dbabca2f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/tokenization_jukebox.py @@ -0,0 +1,407 @@ +# coding=utf-8 +# Copyright 2022 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI Jukebox.""" + +import json +import os +import re +import unicodedata +from json.encoder import INFINITY +from typing import Any, Optional, Union + +import numpy as np +import regex + +from ....tokenization_utils import AddedToken, PreTrainedTokenizer +from ....tokenization_utils_base import BatchEncoding +from ....utils import TensorType, is_flax_available, is_tf_available, is_torch_available, logging +from ....utils.generic import _is_jax, _is_numpy + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "artists_file": "artists.json", + "lyrics_file": "lyrics.json", + "genres_file": "genres.json", +} + + +class JukeboxTokenizer(PreTrainedTokenizer): + """ + Constructs a Jukebox tokenizer. Jukebox can be conditioned on 3 different inputs : + - Artists, unique ids are associated to each artist from the provided dictionary. + - Genres, unique ids are associated to each genre from the provided dictionary. + - Lyrics, character based tokenization. Must be initialized with the list of characters that are inside the + vocabulary. + + This tokenizer does not require training. It should be able to process a different number of inputs: + as the conditioning of the model can be done on the three different queries. If None is provided, defaults values will be used.: + + Depending on the number of genres on which the model should be conditioned (`n_genres`). + ```python + >>> from transformers import JukeboxTokenizer + + >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics") + >>> tokenizer("Alan Jackson", "Country Rock", "old town road")["input_ids"] + [tensor([[ 0, 0, 0, 6785, 546, 41, 38, 30, 76, 46, 41, 49, + 40, 76, 44, 41, 27, 30]]), tensor([[ 0, 0, 0, 145, 0]]), tensor([[ 0, 0, 0, 145, 0]])] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + + + If nothing is provided, the genres and the artist will either be selected randomly or set to None + + + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to: + this superclass for more information regarding those methods. + + However the code does not allow that and only supports composing from various genres. + + Args: + artists_file (`str`): + Path to the vocabulary file which contains a mapping between artists and ids. The default file supports + both "v2" and "v3" + genres_file (`str`): + Path to the vocabulary file which contain a mapping between genres and ids. + lyrics_file (`str`): + Path to the vocabulary file which contains the accepted characters for the lyrics tokenization. + version (`list[str]`, `optional`, default to `["v3", "v2", "v2"]`) : + List of the tokenizer versions. The `5b-lyrics`'s top level prior model was trained using `v3` instead of + `v2`. + n_genres (`int`, `optional`, defaults to 1): + Maximum number of genres to use for composition. + max_n_lyric_tokens (`int`, `optional`, defaults to 512): + Maximum number of lyric tokens to keep. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + artists_file, + genres_file, + lyrics_file, + version=["v3", "v2", "v2"], + max_n_lyric_tokens=512, + n_genres=5, + unk_token="<|endoftext|>", + **kwargs, + ): + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + self.version = version + self.max_n_lyric_tokens = max_n_lyric_tokens + self.n_genres = n_genres + self._added_tokens_decoder = {0: unk_token} + + with open(artists_file, encoding="utf-8") as vocab_handle: + self.artists_encoder = json.load(vocab_handle) + + with open(genres_file, encoding="utf-8") as vocab_handle: + self.genres_encoder = json.load(vocab_handle) + + with open(lyrics_file, encoding="utf-8") as vocab_handle: + self.lyrics_encoder = json.load(vocab_handle) + + oov = r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+" + # In v2, we had a n_vocab=80 and in v3 we missed + and so n_vocab=79 of characters. + if len(self.lyrics_encoder) == 79: + oov = oov.replace(r"\-'", r"\-+'") + + self.out_of_vocab = regex.compile(oov) + self.artists_decoder = {v: k for k, v in self.artists_encoder.items()} + self.genres_decoder = {v: k for k, v in self.genres_encoder.items()} + self.lyrics_decoder = {v: k for k, v in self.lyrics_encoder.items()} + super().__init__( + unk_token=unk_token, + n_genres=n_genres, + version=version, + max_n_lyric_tokens=max_n_lyric_tokens, + **kwargs, + ) + + @property + def vocab_size(self): + return len(self.artists_encoder) + len(self.genres_encoder) + len(self.lyrics_encoder) + + def get_vocab(self): + return { + "artists_encoder": self.artists_encoder, + "genres_encoder": self.genres_encoder, + "lyrics_encoder": self.lyrics_encoder, + } + + def _convert_token_to_id(self, list_artists, list_genres, list_lyrics): + """Converts the artist, genre and lyrics tokens to their index using the vocabulary. + The total_length, offset and duration have to be provided in order to select relevant lyrics and add padding to + the lyrics token sequence. + """ + artists_id = [self.artists_encoder.get(artist, 0) for artist in list_artists] + for genres in range(len(list_genres)): + list_genres[genres] = [self.genres_encoder.get(genre, 0) for genre in list_genres[genres]] + list_genres[genres] = list_genres[genres] + [-1] * (self.n_genres - len(list_genres[genres])) + + lyric_ids = [[self.lyrics_encoder.get(character, 0) for character in list_lyrics[0]], [], []] + return artists_id, list_genres, lyric_ids + + def _tokenize(self, lyrics): + """ + Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based + vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). + + Do NOT take care of added tokens. Only the lyrics are split into character for the character-based vocabulary. + """ + # only lyrics are not tokenized, but character based is easily handled + return list(lyrics) + + def tokenize(self, artist, genre, lyrics, **kwargs): + """ + Converts three strings in a 3 sequence of tokens using the tokenizer + """ + artist, genre, lyrics = self.prepare_for_tokenization(artist, genre, lyrics) + lyrics = self._tokenize(lyrics) + return artist, genre, lyrics + + def prepare_for_tokenization( + self, artists: str, genres: str, lyrics: str, is_split_into_words: bool = False + ) -> tuple[str, str, str, dict[str, Any]]: + """ + Performs any necessary transformations before tokenization. + + Args: + artist (`str`): + The artist name to prepare. This will mostly lower the string + genres (`str`): + The genre name to prepare. This will mostly lower the string. + lyrics (`str`): + The lyrics to prepare. + is_split_into_words (`bool`, *optional*, defaults to `False`): + Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the + tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) + which it will tokenize. This is useful for NER or token classification. + """ + for idx in range(len(self.version)): + if self.version[idx] == "v3": + artists[idx] = artists[idx].lower() + genres[idx] = [genres[idx].lower()] + else: + artists[idx] = self._normalize(artists[idx]) + ".v2" + genres[idx] = [ + self._normalize(genre) + ".v2" for genre in genres[idx].split("_") + ] # split is for the full dictionary with combined genres + + if self.version[0] == "v2": + self.out_of_vocab = regex.compile(r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+") + vocab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+'\"()[] \t\n" + self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))} + self.vocab[""] = 0 + self.n_vocab = len(vocab) + 1 + self.lyrics_encoder = self.vocab + self.lyrics_decoder = {v: k for k, v in self.vocab.items()} + self.lyrics_decoder[0] = "" + else: + self.out_of_vocab = regex.compile(r"[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+") + + lyrics = self._run_strip_accents(lyrics) + lyrics = lyrics.replace("\\", "\n") + lyrics = self.out_of_vocab.sub("", lyrics), [], [] + return artists, genres, lyrics + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _normalize(self, text: str) -> str: + """ + Normalizes the input text. This process is for the genres and the artist + + Args: + text (`str`): + Artist or Genre string to normalize + """ + + accepted = ( + [chr(i) for i in range(ord("a"), ord("z") + 1)] + + [chr(i) for i in range(ord("A"), ord("Z") + 1)] + + [chr(i) for i in range(ord("0"), ord("9") + 1)] + + ["."] + ) + accepted = frozenset(accepted) + pattern = re.compile(r"_+") + text = "".join([c if c in accepted else "_" for c in text.lower()]) + text = pattern.sub("_", text).strip("_") + return text + + def convert_lyric_tokens_to_string(self, lyrics: list[str]) -> str: + return " ".join(lyrics) + + def convert_to_tensors( + self, inputs, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False + ): + """ + Convert the inner content to tensors. + + Args: + tensor_type (`str` or [`~utils.TensorType`], *optional*): + The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If + unset, no modification is done. + prepend_batch_axis (`int`, *optional*, defaults to `False`): + Whether or not to add the batch dimension during the conversion. + """ + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + # Get a function reference for the correct framework + if tensor_type == TensorType.TENSORFLOW: + if not is_tf_available(): + raise ImportError( + "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." + ) + import tensorflow as tf + + as_tensor = tf.constant + is_tensor = tf.is_tensor + elif tensor_type == TensorType.PYTORCH: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + import torch + + as_tensor = torch.tensor + is_tensor = torch.is_tensor + elif tensor_type == TensorType.JAX: + if not is_flax_available(): + raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") + import jax.numpy as jnp # noqa: F811 + + as_tensor = jnp.array + is_tensor = _is_jax + else: + as_tensor = np.asarray + is_tensor = _is_numpy + + # Do the tensor conversion in batch + + try: + if prepend_batch_axis: + inputs = [inputs] + + if not is_tensor(inputs): + inputs = as_tensor(inputs) + except: # noqa E722 + raise ValueError( + "Unable to create tensor, you should probably activate truncation and/or padding " + "with 'padding=True' 'truncation=True' to have batched tensors with the same length." + ) + + return inputs + + def __call__(self, artist, genres, lyrics="", return_tensors="pt") -> BatchEncoding: + """Convert the raw string to a list of token ids + + Args: + artist (`str`): + Name of the artist. + genres (`str`): + List of genres that will be mixed to condition the audio + lyrics (`str`, *optional*, defaults to `""`): + Lyrics used to condition the generation + """ + input_ids = [0, 0, 0] + artist = [artist] * len(self.version) + genres = [genres] * len(self.version) + + artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics) + artists_id, genres_ids, full_tokens = self._convert_token_to_id(artists_tokens, genres_tokens, lyrics_tokens) + + attention_masks = [-INFINITY] * len(full_tokens[-1]) + input_ids = [ + self.convert_to_tensors( + [input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i]], tensor_type=return_tensors + ) + for i in range(len(self.version)) + ] + return BatchEncoding({"input_ids": input_ids, "attention_masks": attention_masks}) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + """ + Saves the tokenizer's vocabulary dictionary to the provided save_directory. + + Args: + save_directory (`str`): + A path to the directory where to saved. It will be created if it doesn't exist. + + filename_prefix (`Optional[str]`, *optional*): + A prefix to add to the names of the files saved by the tokenizer. + + """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + + artists_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["artists_file"] + ) + with open(artists_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.artists_encoder, ensure_ascii=False)) + + genres_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["genres_file"] + ) + with open(genres_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.genres_encoder, ensure_ascii=False)) + + lyrics_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["lyrics_file"] + ) + with open(lyrics_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.lyrics_encoder, ensure_ascii=False)) + + return (artists_file, genres_file, lyrics_file) + + def _convert_id_to_token(self, artists_index, genres_index, lyric_index): + """ + Converts an index (integer) in a token (str) using the vocab. + + Args: + artists_index (`int`): + Index of the artist in its corresponding dictionary. + genres_index (`Union[list[int], int]`): + Index of the genre in its corresponding dictionary. + lyric_index (`list[int]`): + List of character indices, which each correspond to a character. + """ + artist = self.artists_decoder.get(artists_index) + genres = [self.genres_decoder.get(genre) for genre in genres_index] + lyrics = [self.lyrics_decoder.get(character) for character in lyric_index] + return artist, genres, lyrics + + +__all__ = ["JukeboxTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53ec5ed37c13614266d04cde838ff7360946a451 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_mctct import * + from .feature_extraction_mctct import * + from .modeling_mctct import * + from .processing_mctct import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/configuration_mctct.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/configuration_mctct.py new file mode 100644 index 0000000000000000000000000000000000000000..984dca4a62bfa726bae9081183cd18853381d3ad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/configuration_mctct.py @@ -0,0 +1,184 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""M-CTC-T model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class MCTCTConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MCTCTModel`]. It is used to instantiate an + M-CTC-T model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the M-CTC-T + [speechbrain/m-ctc-t-large](https://huggingface.co/speechbrain/m-ctc-t-large) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 8065): + Vocabulary size of the M-CTC-T model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MCTCTModel`]. + hidden_size (`int`, *optional*, defaults to 1536): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 36): + Number of hidden layers in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 4): + Number of attention heads for each attention layer in the Transformer encoder. + attention_head_dim (`int`, *optional*, defaults to 384): + Dimensions of each attention head for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 920): + The maximum sequence length that this model might ever be used with (after log-mel spectrogram extraction). + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + layerdrop (`float`, *optional*, defaults to 0.3): + The probability of dropping an encoder layer during training. The default 0.3 value is used in the original + implementation. + hidden_act (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + hidden_dropout_prob (`float`, *optional*, defaults to 0.3): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.3): + The dropout ratio for the attention probabilities. + pad_token_id (`int`, *optional*, defaults to 1): + The tokenizer index of the pad token. + bos_token_id (`int`, *optional*, defaults to 0): + The tokenizer index of the bos token. + eos_token_id (`int`, *optional*, defaults to 2): + The tokenizer index of the eos token. + conv_glu_dim (`int`, *optional*, defaults to 1): + The dimension of the output of the `Conv1dSubsampler` layer in which GLU is applied on. Though the original + Flashlight code uses the value of 2, here it's adapted to 1 due to transposition differences. + conv_dropout (`int`, *optional*, defaults to 0.3): + The probability of randomly dropping the `Conv1dSubsampler` layer during training. + num_conv_layers (`int`, *optional*, defaults to 1): + Number of convolution layers before applying transformer encoder layers. + conv_kernel (`Sequence[int]`, *optional*, defaults to `(7,)`): + The kernel size of the 1D convolution applied before transformer layers. `len(conv_kernel)` must be equal + to `num_conv_layers`. + conv_stride (`Sequence[int]`, *optional*, defaults to `(3,)`): + The stride length of the 1D convolution applied before transformer layers. `len(conv_stride)` must be equal + to `num_conv_layers`. + input_feat_per_channel (`int`, *optional*, defaults to 80): + Feature dimensions of the channels of the input to the Conv1D layer. + input_channels (`int`, *optional*, defaults to 1): + Number of input channels of the input to the Conv1D layer. + conv_channels (`list[int]`, *optional*): + Channel sizes of intermediate Conv1D layers. + ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): + Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an + instance of [`MCTCTForCTC`]. + ctc_zero_infinity (`bool`, *optional*, defaults to `False`): + Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly + occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance + of [`MCTCTForCTC`]. + + Example: + + ```python + >>> from transformers import MCTCTConfig, MCTCTModel + + >>> # Initializing a M-CTC-T mctct-large style configuration + >>> configuration = MCTCTConfig() + + >>> # Initializing a model (with random weights) from the mctct-large style configuration + >>> model = MCTCTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mctct" + + def __init__( + self, + vocab_size=8065, + hidden_size=1536, + num_hidden_layers=36, + intermediate_size=6144, + num_attention_heads=4, + attention_head_dim=384, + max_position_embeddings=920, + layer_norm_eps=1e-5, + layerdrop=0.3, + hidden_act="relu", + initializer_range=0.02, + hidden_dropout_prob=0.3, + attention_probs_dropout_prob=0.3, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + conv_glu_dim=1, + conv_dropout=0.3, + num_conv_layers=1, + conv_kernel=(7,), + conv_stride=(3,), + input_feat_per_channel=80, + input_channels=1, + conv_channels=None, + ctc_loss_reduction="sum", + ctc_zero_infinity=False, + **kwargs, + ): + super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.layerdrop = layerdrop + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.conv_glu_dim = conv_glu_dim + self.conv_dropout = conv_dropout + self.num_conv_layers = num_conv_layers + self.input_feat_per_channel = input_feat_per_channel + self.input_channels = input_channels + self.conv_channels = conv_channels + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # prevents config testing fail with exporting to json + self.conv_kernel = list(conv_kernel) + self.conv_stride = list(conv_stride) + + if len(self.conv_kernel) != self.num_conv_layers: + raise ValueError( + "Configuration for convolutional module is incorrect. " + "It is required that `len(config.conv_kernel)` == `config.num_conv_layers` " + f"but is `len(config.conv_kernel) = {len(self.conv_kernel)}`, " + f"`config.num_conv_layers = {self.num_conv_layers}`." + ) + + +__all__ = ["MCTCTConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/feature_extraction_mctct.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/feature_extraction_mctct.py new file mode 100644 index 0000000000000000000000000000000000000000..966a160f91b07177b16043a00ccd76685926a422 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/feature_extraction_mctct.py @@ -0,0 +1,291 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Feature extractor class for M-CTC-T +""" + +from typing import Optional, Union + +import numpy as np + +from ....audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function +from ....feature_extraction_sequence_utils import SequenceFeatureExtractor +from ....feature_extraction_utils import BatchFeature +from ....file_utils import PaddingStrategy, TensorType +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class MCTCTFeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs a M-CTC-T feature extractor. + + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. This + code has been adapted from Flashlight's C++ code. For more information about the implementation, one can refer to + this [notebook](https://colab.research.google.com/drive/1GLtINkkhzms-IsdcGy_-tVCkv0qNF-Gt#scrollTo=pMCRGMmUC_an) + that takes the user step-by-step in the implementation. + + Args: + feature_size (`int`, defaults to 80): + The feature dimension of the extracted features. This is the number of mel_frequency + sampling_rate (`int`, defaults to 16000): + The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). + padding_value (`float`, defaults to 0.0): + The value that is used to fill the padding values. + hop_length (`int`, defaults to 10): + Number of audio samples between windows. Otherwise referred to as "shift" in many papers. + win_length (`int`, defaults to 25): + Number of ms per window + win_function (`str`, defaults to `"hamming_window"`): + Name for the window function used for windowing, must be accessible via `torch.{win_function}` + frame_signal_scale (`float`, defaults to 32768.0): + Constant multiplied in creating the frames before applying DFT. + preemphasis_coeff (`float`, defaults to 0.97): + Constant multiplied in applying Pre-emphasis before DFT. + mel_floor (`float` defaults to 1.0): + Minimum value of mel frequency banks. + normalize_means (`bool`, *optional*, defaults to `True`): + Whether or not to zero-mean normalize the extracted features. + normalize_vars (`bool`, *optional*, defaults to `True`): + Whether or not to unit-variance normalize the extracted features. + """ + + model_input_names = ["input_features", "attention_mask"] + + def __init__( + self, + feature_size=80, + sampling_rate=16000, + padding_value=0.0, + hop_length=10, + win_length=25, + win_function="hamming_window", + frame_signal_scale=32768.0, + preemphasis_coeff=0.97, + mel_floor=1.0, + normalize_means=True, + normalize_vars=True, + return_attention_mask=False, + **kwargs, + ): + super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) + + self.feature_size = feature_size + self.sampling_rate = sampling_rate + self.padding_value = padding_value + self.hop_length = hop_length + self.win_length = win_length + self.frame_signal_scale = frame_signal_scale + self.preemphasis_coeff = preemphasis_coeff + self.mel_floor = mel_floor + self.normalize_means = normalize_means + self.normalize_vars = normalize_vars + self.win_function = win_function + self.return_attention_mask = return_attention_mask + + self.sample_size = win_length * sampling_rate // 1000 + self.sample_stride = hop_length * sampling_rate // 1000 + + self.n_fft = optimal_fft_length(self.sample_size) + self.n_freqs = (self.n_fft // 2) + 1 + + def _extract_mfsc_features(self, one_waveform: np.ndarray) -> np.ndarray: + """ + Extracts MFSC Features for one waveform vector (unbatched). Adapted from Flashlight's C++ MFSC code. + """ + if self.win_function == "hamming_window": + window = window_function(window_length=self.sample_size, name=self.win_function, periodic=False) + else: + window = window_function(window_length=self.sample_size, name=self.win_function) + + fbanks = mel_filter_bank( + num_frequency_bins=self.n_freqs, + num_mel_filters=self.feature_size, + min_frequency=0.0, + max_frequency=self.sampling_rate / 2.0, + sampling_rate=self.sampling_rate, + ) + + msfc_features = spectrogram( + one_waveform * self.frame_signal_scale, + window=window, + frame_length=self.sample_size, + hop_length=self.sample_stride, + fft_length=self.n_fft, + center=False, + preemphasis=self.preemphasis_coeff, + mel_filters=fbanks, + mel_floor=self.mel_floor, + log_mel="log", + ) + return msfc_features.T + + def _normalize_one(self, x, input_length, padding_value): + # make sure we normalize float32 arrays + if self.normalize_means: + mean = x[:input_length].mean(axis=0) + x = np.subtract(x, mean) + if self.normalize_vars: + std = x[:input_length].std(axis=0) + x = np.divide(x, std) + + if input_length < x.shape[0]: + x[input_length:] = padding_value + + # make sure array is in float32 + x = x.astype(np.float32) + + return x + + def normalize( + self, input_features: list[np.ndarray], attention_mask: Optional[np.ndarray] = None + ) -> list[np.ndarray]: + lengths = attention_mask.sum(-1) if attention_mask is not None else [x.shape[0] for x in input_features] + return [self._normalize_one(x, n, self.padding_value) for x, n in zip(input_features, lengths)] + + def __call__( + self, + raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]], + padding: Union[bool, str, PaddingStrategy] = False, + max_length: Optional[int] = None, + truncation: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + sampling_rate: Optional[int] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). sequences. It returns the + log-mel spectrogram of the input audio, as implemented in the original Flashlight MFSC feature extraction code. + + Args: + raw_speech (`torch.Tensor`, `np.ndarray`, `list[float]`, `list[torch.Tensor]`, `list[np.ndarray]`, `list[list[float]]`): + The sequence or batch of sequences to be padded. Each sequence can be a tensor, a numpy array, a list + of float values, a list of tensors, a list of numpy arrays or a list of list of float values. Must be + mono channel audio, not stereo, i.e. single float per timestep. + padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + truncation (`bool`): + Activates truncation to cut input sequences longer than *max_length* to *max_length*. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific feature_extractor's default. + + [What are attention masks?](../glossary#attention-mask) + + return_tensors (`str` or [`~file_utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors. + padding_value (`float`, defaults to 0.0): + """ + + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of" + f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with" + f" {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + "It is strongly recommended to pass the ``sampling_rate`` argument to this function. " + "Failing to do so can result in silent errors that might be hard to debug." + ) + + is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") + is_batched = is_batched_numpy or ( + isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) + ) + + if is_batched: + raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech] + elif not is_batched and not isinstance(raw_speech, np.ndarray): + raw_speech = np.asarray(raw_speech, dtype=np.float32) + elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64): + raw_speech = raw_speech.astype(np.float32) + + # always return batch + if not is_batched: + raw_speech = [raw_speech] + + # extract fbank features + features = [self._extract_mfsc_features(one_waveform) for one_waveform in raw_speech] + + # convert into correct format for padding + encoded_inputs = BatchFeature({"input_features": features}) + + padded_inputs = self.pad( + encoded_inputs, + padding=padding, + max_length=max_length, + truncation=truncation, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=True, + **kwargs, + ) + # make sure list is in array format + input_features = padded_inputs.get("input_features") + if isinstance(input_features[0], list): + padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features] + + attention_mask = padded_inputs.get("attention_mask") + if attention_mask is not None: + padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask] + + if self.normalize_means or self.normalize_vars: + attention_mask = ( + np.array(attention_mask, dtype=np.int32) + if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD + and padding + else None + ) + padded_inputs["input_features"] = self.normalize( + padded_inputs["input_features"], attention_mask=attention_mask + ) + + if return_tensors is not None: + padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + + return padded_inputs + + +__all__ = ["MCTCTFeatureExtractor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/modeling_mctct.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/modeling_mctct.py new file mode 100644 index 0000000000000000000000000000000000000000..16f59d3d1dfa11a11bd1ef51ad77e137adf2cc93 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/modeling_mctct.py @@ -0,0 +1,778 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch M-CTC-T model.""" + +import math +from typing import Optional, Union + +import torch +from torch import nn + +from ....activations import ACT2FN +from ....file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward +from ....integrations.deepspeed import is_deepspeed_zero3_enabled +from ....integrations.fsdp import is_fsdp_managed_module +from ....modeling_attn_mask_utils import _prepare_4d_attention_mask +from ....modeling_layers import GradientCheckpointingLayer +from ....modeling_outputs import BaseModelOutput, CausalLMOutput +from ....modeling_utils import PreTrainedModel +from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ....utils import logging +from .configuration_mctct import MCTCTConfig + + +logger = logging.get_logger(__name__) + +_HIDDEN_STATES_START_POSITION = 1 + +_CONFIG_FOR_DOC = "MCTCTConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "speechbrain/m-ctc-t-large" +_EXPECTED_OUTPUT_SHAPE = [1, 195, 1536] + +# CTC docstring +_CTC_EXPECTED_OUTPUT = '"Mr. Quilter is the apostle of the middle classes, and we\'re glad to welcome his gospel."' +_CTC_EXPECTED_LOSS = 1885.65 + + +class MCTCTConv1dSubsampler(nn.Module): + """ + Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation + via gated linear units (https://huggingface.co/papers/1911.08460) + """ + + def __init__(self, config): + super().__init__() + self.config = config + self.glu_dim = config.conv_glu_dim + + self.dropout = nn.Dropout(config.conv_dropout) + + self.num_layers = config.num_conv_layers + self.in_channels = config.input_feat_per_channel * config.input_channels + + if self.num_layers > 1: + if config.conv_channels is None: + raise ValueError( + "Need to specify `conv_channels` configuration in `MCTCTConfig` to use multiple convolution" + " layers." + ) + + self.mid_channels = config.conv_channels + else: + self.mid_channels = None + + self.out_channels = config.hidden_size * 2 # considering GLU halving + self.kernel_size = config.conv_kernel + self.stride = config.conv_stride + + # NOTE: MCTCT by construction only uses one convolution kernel. I've made this flexible to allow for + # multiple layers of convolutions, but not sure if this model definition should just restrict it + # to one layer. This becomes especially relevant when considering the padding like line 1 of forward(). + self.conv_layers = nn.ModuleList( + nn.Conv1d( + self.in_channels if i == 0 else self.mid_channels[i], + self.mid_channels[i] if i < self.num_layers - 1 else self.out_channels, + kernel_size=k, + stride=self.stride[i], + padding="valid", + ) + for i, k in enumerate(self.kernel_size) + ) + + def forward(self, input_features): + # NOTE: in reference to the NOTE in __init__, right now it just calculates padding as if + # there will be just one conv layer. + padding = sum(size // 2 for size in self.kernel_size) # (7, 7) -> (3, 3) + + input_features = torch.nn.functional.pad(input_features, (0, 0, padding, padding), "constant", 0) + hidden_states = input_features.transpose(1, 2).contiguous() # -> Batch x Frame x Time + for conv in self.conv_layers: + hidden_states = conv(hidden_states) + hidden_states = nn.functional.glu(hidden_states, dim=self.glu_dim) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states.transpose(1, 2).contiguous() # -> Batch x Time x Frame + return hidden_states + + +class MCTCTEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = MCTCTLayerNorm() + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), + persistent=False, + ) + + def forward( + self, input_features=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + input_shape = input_features.size() if input_features is not None else inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_features) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class MCTCTSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = config.attention_head_dim + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def reshape_fortran(self, x, shape): + if len(x.shape) > 0: + x = x.permute(*reversed(range(len(x.shape)))) + return x.reshape(*reversed(shape)).permute(*reversed(range(len(shape)))) + + def relative_position_embedding_rotate(self, scores): + # NOTE: should re-evaluate whether this re-implementation was truly necessary + # or the reason why my complete re-haul worked was due to some other part + # of the code. Adding this and the reshape fortrain code seems very undesirable. + scores = scores.permute(0, 2, 3, 1) # e.g. [10, 1839, 14, 4] + + batch, hidden_state, seq_len, heads = scores.shape + + # e.g. [10, 1853, 14, 4] + scores = torch.cat((scores, torch.zeros((batch, seq_len, seq_len, heads), device=scores.device)), dim=1) + + # e.g. [10, 25942, 1, 4] + scores = self.reshape_fortran(scores, [batch, (hidden_state + seq_len) * seq_len, 1, heads]) + + # e.g. [10, 25928, 1, 4] + scores = scores[:, : (seq_len + hidden_state - 1) * seq_len] + + # e.g. [10, 1852, 14, 4] + scores = self.reshape_fortran(scores, [batch, hidden_state + seq_len - 1, seq_len, heads]) + + halfpoint = hidden_state // 2 + scores = scores[:, halfpoint : halfpoint + seq_len].transpose(1, 2) # e.g. [10, 14, 14, 4] + + return scores.permute(0, 3, 1, 2) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + mixed_query_layer = mixed_query_layer / math.sqrt(self.attention_head_size) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + # relative key position embeddings + positional_embedding = self.distance_embedding.weight + relative_position_scores = torch.einsum("lh, bche -> bcle", positional_embedding, query_layer.transpose(2, 3)) + + relative_position_scores = self.relative_position_embedding_rotate(relative_position_scores) + attention_scores = attention_scores + relative_position_scores + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in MCTCTModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).flatten(start_dim=-2) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class MCTCTLayerNorm(nn.Module): + def __init__(self): + super().__init__() + self.singleton_weight = nn.Parameter(torch.ones(1)) + self.singleton_bias = nn.Parameter(torch.zeros(1)) + + def forward(self, hidden_states): + return (hidden_states * self.singleton_weight) + self.singleton_bias + + +class MCTCTSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=False) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class MCTCTAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = MCTCTSelfAttention(config) + self.output = MCTCTSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + + return outputs + + +class MCTCTIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class MCTCTOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class MCTCTLayer(GradientCheckpointingLayer): + def __init__(self, config: MCTCTConfig): + super().__init__() + + self.seq_len_dim = 1 + self.chunk_size_feed_forward = config.chunk_size_feed_forward + + self.intermediate = MCTCTIntermediate(config) + self.attention = MCTCTAttention(config) + self.is_decoder = config.is_decoder + self.output = MCTCTOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + ): + self_attention_outputs = self.attention( + hidden_states, attention_mask, head_mask, output_attentions=output_attentions + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class MCTCTPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: MCTCTConfig + base_model_prefix = "mctct" + main_input_name = "input_features" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + std = self.config.initializer_range + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, MCTCTLayerNorm): + module.singleton_weight.data.fill_(1.0) + module.singleton_bias.data.zero_() + if isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + + def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor): + """ + Computes the output length of the convolutional layers + """ + dilation = 1 + for _, kernel_sz, stride in zip( + range(self.config.num_conv_layers), self.config.conv_kernel, self.config.conv_stride + ): + padding = kernel_sz // 2 + input_lengths = input_lengths + 2 * padding - dilation * (kernel_sz - 1) - 1 + input_lengths = torch.div(input_lengths, stride, rounding_mode="trunc") + 1 + + return input_lengths + + def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask): + # generate creates 3D attention mask, because of the shape of input_features + # convert it to 2D if that's the case + if len(attention_mask.shape) > 2: + attention_mask = attention_mask[:, :, -1] + + # subsampled_lengths = attention_mask.sum(-1) + subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)) + bsz = attention_mask.size()[0] + attention_mask = torch.zeros( + (bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device + ) + + # these two operations makes sure that all values + # before the output lengths indices are attended to + attention_mask[(torch.arange(bsz, device=attention_mask.device), subsampled_lengths - 1)] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).long() + return attention_mask + + +MCTCT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`MCTCTConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +MCTCT_INPUTS_DOCSTRING = r""" + Args: + input_features (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`Wav2Vec2CTCTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +class MCTCTEncoder(MCTCTPreTrainedModel): + def __init__(self, config: MCTCTConfig): + super().__init__(config) + self.hidden_dropout_prob = config.hidden_dropout_prob + + self.layer_norm = MCTCTLayerNorm() + self.conv = MCTCTConv1dSubsampler(config) + self.layers = nn.ModuleList([MCTCTLayer(config) for _ in range(config.num_hidden_layers)]) + + self.gradient_checkpointing = False + + def forward( + self, + input_features: torch.Tensor, + attention_mask: torch.Tensor, + head_mask: torch.Tensor, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + input_features = self.layer_norm(input_features) + + inputs_embeds = self.conv(input_features) + + # subsample attention mask if necessary + if attention_mask is not None: + attention_mask = self._get_feature_vector_attention_mask(inputs_embeds.shape[1], attention_mask) + + hidden_states = nn.functional.dropout(inputs_embeds, p=self.hidden_dropout_prob, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.size()[0] != len(self.layers): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, " + f"but it is for {head_mask.size()[0]}." + ) + + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = self.training and dropout_probability < self.config.layerdrop + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync + layer_outputs = encoder_layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +@add_start_docstrings( + "The bare M-CTC-T Model transformer outputting raw hidden-states without any specific head on top.", + MCTCT_START_DOCSTRING, +) +class MCTCTModel(MCTCTPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.encoder = MCTCTEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(MCTCT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + input_features: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_features is None: + raise ValueError("You have to specify input_features.") + + encoder_outputs = self.encoder( + input_features, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """MCTCT Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", + MCTCT_START_DOCSTRING, +) +class MCTCTForCTC(MCTCTPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.mctct = MCTCTModel(config) + + if config.vocab_size is None: + raise ValueError( + f"You are trying to instantiate {self.__class__} with a configuration that " + "does not define the vocabulary size of the language model head. Please " + "instantiate the model as follows: `MCTCTForCTC.from_pretrained(..., vocab_size=vocab_size)`. " + "or define `vocab_size` of your model's configuration." + ) + output_hidden_size = config.hidden_size + + self.ctc_head = nn.Linear(output_hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(MCTCT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_CTC_EXPECTED_OUTPUT, + expected_loss=_CTC_EXPECTED_LOSS, + ) + def forward( + self, + input_features: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[tuple, CausalLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. + """ + if labels is not None and labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.mctct( + input_features, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + logits = self.ctc_head(hidden_states) + + loss = None + if labels is not None: + # retrieve loss input_lengths from attention_mask + attention_mask = ( + attention_mask + if attention_mask is not None + else torch.ones(input_features.shape[:-1], dtype=torch.long) + ) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) + # assuming that padded tokens are filled with -100 + # when not being attended to + labels_mask = labels >= 0 + target_lengths = labels_mask.sum(-1) + flattened_targets = labels.masked_select(labels_mask) + + # ctc_loss doesn't support fp16 + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) + + with torch.backends.cudnn.flags(enabled=False): + loss = nn.functional.ctc_loss( + log_probs, + flattened_targets, + input_lengths, + target_lengths, + blank=self.config.pad_token_id, + reduction=self.config.ctc_loss_reduction, + zero_infinity=self.config.ctc_zero_infinity, + ) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +__all__ = ["MCTCTForCTC", "MCTCTModel", "MCTCTPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/processing_mctct.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/processing_mctct.py new file mode 100644 index 0000000000000000000000000000000000000000..9661a5082d4838b4e60042be2ceb747c03e045ce --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/processing_mctct.py @@ -0,0 +1,139 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Speech processor class for M-CTC-T +""" + +import warnings +from contextlib import contextmanager + +from ....processing_utils import ProcessorMixin + + +class MCTCTProcessor(ProcessorMixin): + r""" + Constructs a MCTCT processor which wraps a MCTCT feature extractor and a MCTCT tokenizer into a single processor. + + [`MCTCTProcessor`] offers all the functionalities of [`MCTCTFeatureExtractor`] and [`AutoTokenizer`]. See the + [`~MCTCTProcessor.__call__`] and [`~MCTCTProcessor.decode`] for more information. + + Args: + feature_extractor (`MCTCTFeatureExtractor`): + An instance of [`MCTCTFeatureExtractor`]. The feature extractor is a required input. + tokenizer (`AutoTokenizer`): + An instance of [`AutoTokenizer`]. The tokenizer is a required input. + """ + + feature_extractor_class = "MCTCTFeatureExtractor" + tokenizer_class = "AutoTokenizer" + + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) + self.current_processor = self.feature_extractor + self._in_target_context_manager = False + + def __call__(self, *args, **kwargs): + """ + When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's + [`~MCTCTFeatureExtractor.__call__`] and returns its output. If used in the context + [`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to AutoTokenizer's + [`~AutoTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information. + """ + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor(*args, **kwargs) + + if "raw_speech" in kwargs: + warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.") + audio = kwargs.pop("raw_speech") + else: + audio = kwargs.pop("audio", None) + sampling_rate = kwargs.pop("sampling_rate", None) + text = kwargs.pop("text", None) + if len(args) > 0: + audio = args[0] + args = args[1:] + + if audio is None and text is None: + raise ValueError("You need to specify either an `audio` or `text` input to process.") + + if audio is not None: + inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs) + if text is not None: + encodings = self.tokenizer(text, **kwargs) + + if text is None: + return inputs + elif audio is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] + return inputs + + def pad(self, *args, **kwargs): + """ + When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's + [`~MCTCTFeatureExtractor.pad`] and returns its output. If used in the context + [`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's + [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information. + """ + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor.pad(*args, **kwargs) + + input_features = kwargs.pop("input_features", None) + labels = kwargs.pop("labels", None) + if len(args) > 0: + input_features = args[0] + args = args[1:] + + if input_features is not None: + input_features = self.feature_extractor.pad(input_features, *args, **kwargs) + if labels is not None: + labels = self.tokenizer.pad(labels, **kwargs) + + if labels is None: + return input_features + elif input_features is None: + return labels + else: + input_features["labels"] = labels["input_ids"] + return input_features + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to AutoTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the + docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @contextmanager + def as_target_processor(self): + """ + Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning MCTCT. + """ + warnings.warn( + "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your " + "labels by using the argument `text` of the regular `__call__` method (either in the same call as " + "your audio inputs, or in a separate call." + ) + self._in_target_context_manager = True + self.current_processor = self.tokenizer + yield + self.current_processor = self.feature_extractor + self._in_target_context_manager = False + + +__all__ = ["MCTCTProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cff2c19505f9306c28edb5bdabb66f668363f5fa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_mega import * + from .modeling_mega import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/configuration_mega.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/configuration_mega.py new file mode 100644 index 0000000000000000000000000000000000000000..0ede2cac66bf292c448b488d18af4d2e6e2c716a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/configuration_mega.py @@ -0,0 +1,243 @@ +# coding=utf-8 +# Copyright 2023 The Mega Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MEGA configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from ....configuration_utils import PretrainedConfig +from ....onnx import OnnxConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class MegaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MegaModel`]. It is used to instantiate a Mega + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Mega + [mnaylor/mega-base-wikitext](https://huggingface.co/mnaylor/mega-base-wikitext) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Mega model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MegaModel`]. + hidden_size (`int`, *optional*, defaults to 128): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 4): + Number of hidden layers in the Mega encoder. + intermediate_size (`int`, *optional*, defaults to 256): + Dimensionality of the hidden size (self-attention value projection) within the Mega encoder + ema_projection_size (`int`, *optional*, defaults to 16): + Dimensionality of the MegaMultiDimensionDampedEma + bidirectional (`bool`, *optional*, defaults to `True`): + Whether the MegaMultiDimensionDampedEma used in Mega's self-attention should work bidirectionally (`True`) + or unidirectionally (`False`). Bidirectional EMA is incompatible with causal decoding, so this should be + False if you intend to use the model as a decoder. + shared_representation_size (`int`, *optional*, defaults to 64): + Dimensionality of the linear projection for shared representation of self-attention queries and keys + use_chunking (`bool`, *optional*, defaults to `False`): + Whether to chunk inputs for linear self-attention complexity (described as Mega-chunk in the paper) + chunk_size (`int`, *optional*, defaults to -1): + If `use_chunking` is set to `True`, determines the size of the chunks to apply to the input sequence. If + chunking is used, input sequences must be padded to a multiple of `chunk_size` + truncation (`int`, *optional*): + If specified, the sequence length for which to truncate MegaMultiDimensionDampedEma + normalize_before_mega (`bool`, *optional*, defaults to `True`): + Whether to normalize before (`True`) or after (`False`) passing through Mega encoder blocks + normalization_type (`str`, *optional*, defaults to `"scalenorm"`): + Type of normalization to use in Mega encoder blocks. Choose one of `"scalenorm"`, `"layernorm"`, + `"rmsnorm"`, `"batchnorm"`, or `"syncbatchnorm"` (GPU required for syncbatchnorm) + norm_affine (`bool`, *optional*, defaults to `True`): + If `True`, applies a parameterized affine transformation to inputs during normalization + activation (`str`, *optional*, defaults to `"silu"`): + Activation function to apply within Mega encoder blocks. Choose one of `"silu"`, `"relu"`, `"linear"`, + `"gelu"`, or `"gelu_accurate"` + attention_activation (`str`, *optional*, defaults to `"softmax"`): + Activation function to apply for single-headed self-attention (a la Transformer). Choose one of + `"softmax"`, `"laplace"`, or `"relu2"` + dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for EMA self-attention + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + use_feature_dropout (`bool`, *optional*, defaults to `False`): + Whether to use feature-based (`True`) or standard dropout (`False`) + use_normalized_ffn (`bool`, *optional*, defaults to `True`): + Whether to use the normalized feed-forward sub-layer in Mega blocks (`True`) or pass Mega encoder output + as-is (`False`) + nffn_hidden_size (`int`, *optional*, defaults to 256): + If using the normalized feed-forward network (NFFN) layer within Mega (`use_normalized_ffn = True`), this + is the hidden size of the NFFN + normalize_before_ffn (`bool`, *optional*, defaults to `True`): + Whether to normalize before (`True`) or after (`False`) the feed-forward portion of NFFN + nffn_activation_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the NFFN component. + max_positions (`int`, *optional*, defaults to 2048): + The maximum sequence length to use for positional representations. For `"simple"` relative positional bias, + this is a hard limit on input length; `"rotary"` relative positional bias will extrapolate to longer + sequences + add_token_type_embeddings (`bool`, *optional*, defaults to `True`): + Whether to account for token types in embeddings. Left as optional to maintain compatibility with original + implementation while adding support for token types. + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`MegaModel`]. Only used if + `add_token_type_embeddings = True` + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + ema_delta_alpha_range (`float`, *optional*, defaults to 0.2): + The standard deviation for initializing the delta (damping factor) and alpha (decay factor) parameters in + MegaMultiDimensionDampedEma. + ema_beta_range (`float`, *optional*, defaults to 0.02): + The standard deviation for initializing the beta parameter (expansion matrix) in + MegaMultiDimensionDampedEma. + ema_gamma_omega_range (`float`, *optional*, defaults to 1.0): + The standard deviation for initializing the gamma (projection matrix) and omega (residual weight) + parameters in MultiDimensionEMA. + relative_positional_bias (`str`, *optional*, defaults to `"rotary"`): + Type of relative positional encoding. Choose one of `"rotary"` or `"simple"`. If `"simple"` is selected, + `max_positions` is used as a limit on input size, while `"rotary"` extrapolates beyond `max_positions`. + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + add_lm_hidden_dense_layer (`bool`, *optional*, defaults to `True`): + Whether to include a hidden layer for projection between encoder outputs and LM heads (`True`) or pass + hidden states directly to LM head (`False`). Remains optional for compatibility with original + implementation + + Examples: + + ```python + >>> from transformers import MegaConfig, MegaModel + + >>> # Initializing a Mega configuration + >>> configuration = MegaConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = MegaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mega" + + def __init__( + self, + vocab_size=30522, + hidden_size=128, + num_hidden_layers=4, + intermediate_size=256, + ema_projection_size=16, + bidirectional=True, + shared_representation_size=64, + use_chunking=False, + chunk_size=-1, + truncation=None, + normalize_before_mega=True, + normalization_type="scalenorm", + norm_affine=True, + activation="silu", + attention_activation="softmax", + dropout_prob=0.1, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + use_feature_dropout=False, + use_normalized_ffn=True, + nffn_hidden_size=256, + normalize_before_ffn=True, + nffn_activation_dropout_prob=0.1, + max_positions=2048, + add_token_type_embeddings=False, + type_vocab_size=2, + initializer_range=0.02, + ema_delta_alpha_range=0.2, + ema_beta_range=0.02, + ema_gamma_omega_range=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + relative_positional_bias="rotary", + classifier_dropout=None, + use_cache=True, + add_lm_hidden_dense_layer=True, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.activation = activation + self.attention_activation = attention_activation + self.intermediate_size = intermediate_size + self.ema_projection_size = ema_projection_size + self.bidirectional = bidirectional + self.shared_representation_size = shared_representation_size + self.use_chunking = use_chunking + self.chunk_size = chunk_size + self.truncation = truncation + self.normalize_before_mega = normalize_before_mega + self.normalization_type = normalization_type + self.norm_affine = norm_affine + self.dropout_prob = dropout_prob + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.use_feature_dropout = use_feature_dropout + self.use_normalized_ffn = use_normalized_ffn + self.nffn_hidden_size = nffn_hidden_size + self.normalize_before_ffn = normalize_before_ffn + self.nffn_activation_dropout_prob = nffn_activation_dropout_prob + self.max_positions = max_positions + self.add_token_type_embeddings = add_token_type_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.ema_delta_alpha_range = ema_delta_alpha_range + self.ema_beta_range = ema_beta_range + self.ema_gamma_omega_range = ema_gamma_omega_range + self.relative_positional_bias = relative_positional_bias + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + self.add_lm_hidden_dense_layer = add_lm_hidden_dense_layer + self.num_attention_heads = 1 # not used but required by Hugging Face + + +class MegaOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) + + +__all__ = ["MegaConfig", "MegaOnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/modeling_mega.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/modeling_mega.py new file mode 100644 index 0000000000000000000000000000000000000000..c237afee9a334d57d155b8efa2eb5de8278d21dd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/modeling_mega.py @@ -0,0 +1,2276 @@ +# coding=utf-8 +# Copyright 2023 The Mega Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch MEGA model.""" + +import math +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ....activations import ACT2FN +from ....cache_utils import Cache +from ....modeling_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ....modeling_utils import PreTrainedModel +from ....utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ....utils.deprecation import deprecate_kwarg +from .configuration_mega import MegaConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "mnaylor/mega-base-wikitext" +_CONFIG_FOR_DOC = "MegaConfig" + + +class MegaEmbeddings(nn.Module): + """ + Mega's basic implementation does not incorporate token type embeddings, so this is a stripped-down version of + RoBERTa's embeddings which optionally includes token types + """ + + def __init__(self, config: MegaConfig): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.use_token_types = config.add_token_type_embeddings + if self.use_token_types: + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + # registering a buffer here allows model tracing when not passing optional token type IDs + # more info at transformers issue #5664 + self.register_buffer( + "token_type_ids", torch.zeros(config.max_positions, dtype=torch.long).expand((1, -1)), persistent=False + ) + + self.padding_idx = config.pad_token_id + + def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None): + if (input_ids is None) and (inputs_embeds is None): + raise ValueError("Must provide one of input_ids or inputs_embeds") + elif input_ids is not None: + input_shape = input_ids.size() + device = input_ids.device + + # get the word embeddings if only IDs are provided + inputs_embeds = self.word_embeddings(input_ids) + else: + input_shape = inputs_embeds.size()[:-1] + device = inputs_embeds.device + + # the original Mega implementation did not include token type embeddings, so we add + # an option to use them if desired; if embeddings are present and token type IDs are + # not provided, we will use a registered buffer (which helps with tracing) + if self.use_token_types: + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, : input_shape[1]] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], input_shape[1]) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # access token type embeddings + token_type_embeddings = self.token_type_embeddings(token_type_ids) + # add the token type embeddings to the word embeddings + embeddings = inputs_embeds + token_type_embeddings + else: + embeddings = inputs_embeds + return embeddings + + +class MegaSimpleRelativePositionalBias(nn.Module): + """ + Simple relative positional embeddings copied from the Mega repo; renamed variables for better readability + """ + + def __init__(self, config: MegaConfig): + super().__init__() + self.config = config + self.max_positions = self.config.max_positions if self.config.chunk_size < 0 else self.config.chunk_size + self.rel_pos_bias = nn.Parameter(torch.Tensor(2 * config.max_positions - 1)) + + def forward(self, seq_len): + if seq_len > self.max_positions: + raise ValueError(f"Sequence length {seq_len} going beyond max length {self.max_positions}") + + # seq_len * 2 - 1 + bias = self.rel_pos_bias[(self.max_positions - seq_len) : (self.max_positions + seq_len - 1)] + # seq_len * 3 - 1 + tile = F.pad(bias, (0, seq_len)) + # (seq_len * 3 - 1) * seq_len + tile = torch.tile(tile, (seq_len,)) + tile = tile[:-seq_len] + # seq_len x (3 * seq_len - 2) + tile = tile.view(seq_len, 3 * seq_len - 2) + start = (2 * seq_len - 1) // 2 + end = tile.size(1) - start + tile = tile[:, start:end] + return tile + + +class MegaRotaryRelativePositionalBias(nn.Module): + """ + Rotary relative bias for positional information; similar in concept to RoPE (i.e. RoFormer) but taken from the Mega + repo due to differences in implementation. + + When initialized, produces a positional bias which ranges from position 0 to config.max_positions, but can + extrapolate to longer sequences. Can be indexed according to input position IDs + """ + + def __init__(self, config: MegaConfig): + super().__init__() + if config.hidden_size % 2 != 0: + raise RuntimeError("Rotary positional bias requires `hidden_size` to be a multiple of 2") + self.config = config + self.embed_dim = config.shared_representation_size + self.max_positions = self.config.max_positions if self.config.chunk_size < 0 else self.config.chunk_size + self.sine, self.cosine = MegaRotaryRelativePositionalBias.get_sinusoid_embeddings( + config.max_positions, self.embed_dim + ) + # alpha and beta parameters for the rotary bias; beta renamed to b_param to avoid clashes with tf/flax weight handling + # in loading pretrained weights + self.alpha = nn.Parameter(torch.Tensor(1, self.embed_dim)) + self.b_param = nn.Parameter(torch.Tensor(1, self.embed_dim)) + self.register_buffer("_float_tensor", torch.FloatTensor([0.0])) + + @staticmethod + def get_sinusoid_embeddings(max_positions: int, embedding_dim: int): + half_dim = embedding_dim // 2 + emb = math.log(10000) / half_dim + emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb) + emb = torch.arange(max_positions, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0) + return torch.sin(emb), torch.cos(emb) + + def rotary(self, input): + seq_len, embed_dim = input.size() + chunk_1, chunk_2 = torch.chunk(input, 2, dim=-1) + if self.sine is None or seq_len > self.sine.size(0): + self.sine, self.cosine = MegaRotaryRelativePositionalBias.get_sinusoid_embeddings(seq_len, embed_dim) + self.max_positions = seq_len + self.sine = self.sine.to(self._float_tensor) + self.cosine = self.cosine.to(self._float_tensor) + + sin = self.sine[:seq_len] + cos = self.cosine[:seq_len] + return torch.cat([chunk_1 * cos - chunk_2 * sin, chunk_2 * cos + chunk_1 * sin], dim=1) + + def forward(self, seq_len): + rotary_alpha = self.rotary(self.alpha.expand(seq_len, self.embed_dim)) + rotary_beta = self.rotary(self.b_param.expand(seq_len, self.embed_dim)) + bias = torch.einsum("mk,nk->mn", rotary_alpha, rotary_beta) + return bias + + +class MegaDropout(nn.Module): + """ + A unified class for standard dropout functionality and featurewise dropout. + + The original fairseq Mega repo used 2 classes for these, which included some unnecessary handling of training logic + and an unused `inplace` option. The original implementation used torch.nn.functional instead of submodules, which + is retained here as well. + """ + + def __init__(self, dropout_probability, is_featurewise=False): + super().__init__() + self.dropout_probability = dropout_probability + self.is_featurewise = is_featurewise + + def forward(self, input, batch_first: bool = False): + if self.is_featurewise: + if batch_first: + # (batch_size X sequence_length X feature_dimension) + # -> (batch_size X feature_dimension X sequence_length) + # -> (batch_size X sequence_length X feature_dimension) + return F.dropout2d( + input.transpose(-1, -2), p=self.dropout_probability, training=self.training + ).transpose(-1, -2) + else: + if input.dim() != 3: + raise ValueError( + "Feature dropout inputs must be exactly 3-dimensional if inputs are ordered [sequence length, batch size, hidden dimension]" + ) + # (sequence_length X batch_size X feature_dimension) + # -> (batch_size X feature_dimension X sequence_length) + # -> (sequence_length X batch_size X feature_dimension) + return F.dropout2d(input.permute(1, 2, 0), p=self.dropout_probability, training=self.training).permute( + 2, 0, 1 + ) + else: + return F.dropout(input, p=self.dropout_probability, training=self.training) + + +class MegaRMSNorm(nn.Module): + """ + RMSNorm used in Mega implementation. Differs from T5's RMSNorm by applying the weight prior to taking the square + root (as opposed to after in T5) + """ + + def __init__(self, number_features, eps=1e-6, affine=True): + super().__init__() + self.num_features = number_features + self.eps = eps + self.affine = affine + if affine: + self.weight = nn.Parameter(torch.Tensor(self.num_features)) + else: + self.register_parameter("weight", None) + + def forward(self, input): + mean_square = torch.mean(torch.square(input), dim=-1, keepdim=True) + if self.weight is not None: + input = input * self.weight + + input * torch.rsqrt(mean_square + self.eps) + return input + + def extra_repr(self): + return f"{self.num_features}, eps={self.eps}, affine={self.affine}" + + +class MegaScaleNorm(nn.Module): + """ + Scale normalization introduced in MEGA which is similar to RMSNorm, but uses a single parameter for scalar + multiplication instead of a vector, and applies over a specified dimension + """ + + def __init__(self, dim, eps=1e-6, affine=True): + super().__init__() + self.dim = dim + self.eps = eps + self.affine = affine + if affine: + self.scalar = nn.Parameter(torch.Tensor(1)) + else: + self.register_parameter("scalar", None) + + def forward(self, input): + mean_square = torch.mean(torch.square(input), dim=self.dim, keepdim=True) + if self.scalar is not None: + input = self.scalar * input + + output = input * torch.rsqrt(mean_square + self.eps) + return output + + +class MegaSequenceNorm(nn.Module): + """ + A wrapper class for various layer normalization options used in Mega. Used to handle differences in expectations on + input axis locations for different normalization methods. + """ + + def __init__(self, norm_type, embedding_dim, eps=1e-5, affine=True, export=False): + super().__init__() + if norm_type == "layernorm": + self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine=affine) + elif norm_type == "scalenorm": + self.norm = MegaScaleNorm(dim=-1, eps=eps, affine=affine) + elif norm_type == "rmsnorm": + self.norm = MegaRMSNorm(embedding_dim, eps=eps, affine=affine) + elif norm_type == "batchnorm": + self.norm = nn.BatchNorm1d(embedding_dim, eps=eps, affine=affine) + elif norm_type == "syncbatchnorm": + self.norm = nn.SyncBatchNorm(embedding_dim, eps=eps, affine=affine) + else: + raise ValueError(f"Unknown norm type: {norm_type}") + + def forward(self, input): + if isinstance(self.norm, nn.modules.batchnorm._BatchNorm): + if input.dim() != 3: + raise ValueError("BatchNorm inputs must be exactly 3-dimensional") + input = input.permute(1, 2, 0) + input = self.norm(input) + return input.permute(2, 0, 1) + else: + return self.norm(input) + + +class MegaMultiDimensionDampedEma(nn.Module): + """ + Mega's Exponential Moving Average layer, largely left unmodified from the original repo with the exception of + variable names and moving away from the stateful representation of incremental decoding state. See + "https://huggingface.co/papers/2209.10655" for more details. + """ + + def __init__(self, config: MegaConfig): + super().__init__() + + self.config = config + + self.embed_dim = config.hidden_size + self.ndim = config.ema_projection_size + self.bidirectional = config.bidirectional + self.truncation = config.truncation + self.scale = math.sqrt(1.0 / self.ndim) + + kernel_dim = 2 * config.hidden_size if self.bidirectional else config.hidden_size + # renamed delta (damping_factor) and alpha (decay_factor) to be more descriptive of what the parameters are doing + self.damping_factor = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1)) + self.decay_factor = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1)) + # renamed gamma (kernel_projection_matrix) and beta (ema_expansion_matrix) respectively to avoid HF renaming + # things and align with the paper's description of these params' behavior + self.ema_expansion_matrix = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1)) + self.kernel_projection_matrix = nn.Parameter(torch.Tensor(kernel_dim, self.ndim)) + # renamed omega to residual_weight to describe what it's doing + self.residual_weight = nn.Parameter(torch.Tensor(config.hidden_size)) + self._kernel = None + self._coeffs = None + + def _compute_ema_coefficients(self): + self._coeffs = None + # convert the alpha and delta parameters (kernel_dim x EMA projection size x 1) to [0, 1] with sigmoid + damping_factor = torch.sigmoid(self.damping_factor) + decay_factor = torch.sigmoid(self.decay_factor) + previous_timestep_weight = 1.0 - damping_factor * decay_factor + return damping_factor, previous_timestep_weight + + def _compute_efficient_ema_kernel(self, length: int): + # computes the kernel used for efficient damped EMA applied via FFT convolution + self._kernel = None + # p and q have shape (kernel_dim x ema_projection_size x 1) + damping_factor, previous_timestep_weight = self._compute_ema_coefficients() + # extend the kernel to (kernel_dim X ema_projection_size X sequence_length) and + # multiply q by sequential ints up to the sequence length + vander = torch.arange(length).to(damping_factor).view(1, 1, length) * torch.log(previous_timestep_weight) + kernel = (damping_factor * self.ema_expansion_matrix) * torch.exp(vander) + # (kernel_dim X ema_projection_size X sequence_length) -> (kernel_dim, sequence_length) + return torch.einsum("dnl,dn->dl", kernel, self.kernel_projection_matrix * self.scale) + + def get_ema_coefficients(self): + if self.training: + return self._compute_ema_coefficients() + else: + if self._coeffs is None: + self._coeffs = self._compute_ema_coefficients() + return self._coeffs + + def get_ema_kernel(self, length: int): + kernel_size = length if self.truncation is None else min(self.truncation, length) + if self.training: + return self._compute_efficient_ema_kernel(kernel_size) + else: + if self._kernel is None or self._kernel.size(-1) < kernel_size: + self._kernel = self._compute_efficient_ema_kernel(kernel_size) + return self._kernel[..., :kernel_size] + + def fft_convolution(self, inputs, kernel, length): + # this is a wrapper for repeated use of EMA calculation via FFT (fast Fourier transform) convolution + inputs_fft = torch.fft.rfft(inputs.float(), n=2 * length) + kernel_fft = torch.fft.rfft(kernel.float(), n=2 * length) + convolved_sequence = torch.fft.irfft(inputs_fft * kernel_fft, n=2 * length) + return convolved_sequence + + def ema_step(self, inputs, length, past_state=None): + if length == 1: + return self.one_ema_step(inputs, past_state=past_state) + + # (kernel_dim X ema_projection_size X 1) + damping_factor, previous_timestep_weight = self.get_ema_coefficients() + # (kernel_dim X ema_projection_size X 1+sequence_length) + vander = torch.arange(length + 1).to(damping_factor).view(1, 1, length + 1) * torch.log( + previous_timestep_weight + ) + vander = torch.exp(vander) + if past_state is not None: + # (kernel_dim X ema_projection_size X sequence_length) * (kernel_dim X ema_projection_size X 1) + # -> (kernel_dim X ema_projection_size X sequence_length) + past_ema_proj = vander[:, :, 1:] * (self.kernel_projection_matrix * self.scale).unsqueeze(-1) + # past_state will be (batch_size, kernel_dim, ema_projection_size) + past_ema_state = torch.einsum("bdn,dnl->bdl", past_state, past_ema_proj) + # (kernel_dim X ema_projection_size) * (batch_size X kernel_dim X ema_projection_size) + # -> (batch_size X kernel_dim X ema_projection_size) + past_vandermonde = vander[:, :, -1] * past_state + else: + past_ema_state = None + past_vandermonde = None + + # (kernel_dim X ema_projection_size X sequence_length) + vander = vander[:, :, :-1] + kernel = (damping_factor * self.ema_expansion_matrix) * vander + kernel_proj = torch.einsum("dnl,dn->dl", kernel, self.kernel_projection_matrix * self.scale) + + ema_output = self.fft_convolution(inputs, kernel_proj, length=length)[..., 0:length] + ema_output = ema_output.type_as(inputs) + if past_ema_state is not None: + ema_output = ema_output + past_ema_state + + updated_hidden_state = torch.einsum("bdl,dnl->bdn", inputs, torch.flip(kernel, dims=[2])) + if past_vandermonde is not None: + updated_hidden_state = updated_hidden_state + past_vandermonde + # return a tuple: + # (sequence_length, batch_size, kernel_dim) + # (batch_size, kernel_dim, ema_projection_size) + return ema_output.permute(2, 0, 1), updated_hidden_state + + def one_ema_step(self, inputs, past_state=None): + damping_factor, previous_timestep_weight = self.get_ema_coefficients() + # (kernel_dim X ema_projection_size) x (batch_size X kernel_dim X 1) + # -> (batch_size X kernel_dim X ema_projection_size) + updated_state = (damping_factor * self.ema_expansion_matrix).squeeze(-1) * inputs + if past_state is not None: + updated_state = updated_state + previous_timestep_weight.squeeze(-1) * past_state + # (batch_size X kernel_dim) + out = torch.einsum("bdn,dn->bd", updated_state, self.kernel_projection_matrix * self.scale) + # (1 X batch_size X kernel_dim), (batch_size X kernel_dim X ema_projection_size) + return out.unsqueeze(0), updated_state + + def forward( + self, + inputs, + attention_mask: Optional[torch.Tensor] = None, + prev_state: Optional[torch.Tensor] = None, + use_cache: bool = False, + ) -> torch.Tensor: + """ + Mega's exponential moving average (EMA) sub-layer applied prior to single-headed (traditional) self-attention + + Args: + inputs (`torch.Tensor` of shape `(sequence_length, batch_size, hidden_size)`): + Hidden state / embedding input to update via EMA based on FFT convolution + attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indicates which inputs are to be ignored (mostly due to padding), where elements are either 1 for *not + masked* or 0 for *masked* + prev_state (`torch.Tensor` of shape `(batch_size, config.ndim)`, *optional*): + The hidden state returned from the previous timestep during incremental decoding. + use_cache (`bool`, default `False`): + Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the + updated EMA hidden state for use in the next step + + Returns: + `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and + inputs: + - **hidden_states** (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`) -- Hidden + states updated by EMA, with same shapes as inputs + - **updated_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor of shape `(batch_size, + config.ndim)` -- The incremental EMA state for use in the next step of incremental decoding + """ + + seq_len, bsz, embed_dim = inputs.size() + if embed_dim != self.embed_dim: + raise ValueError( + f"Unexpected embedding dimension received: input is {embed_dim}, model expects {self.embed_dim}" + ) + + # sequence_length X batch_size X hidden_size + residual = inputs * self.residual_weight + + # (sequence_length x batch_size x hidden_size) -> (batch_size x hidden_size x sequence_length) + inputs = inputs.permute(1, 2, 0) + # mask the input: output is a tensor with 0 in the masked positions + if attention_mask is not None: + inputs = inputs * (attention_mask.unsqueeze(1).type_as(inputs)) + + if self.bidirectional and use_cache: + raise RuntimeError("Bidirectional EMA does not support incremental state") + + if use_cache: + out, updated_state = self.ema_step(inputs, seq_len, past_state=prev_state) + + # (batch_size X hidden_size) -> (1 x batch_size x hidden_size) + out = F.silu(out + residual) + + # if incremental decoding, return the new state along with the output + return out, updated_state + else: + # (hidden_size x sequence_length) + kernel = self.get_ema_kernel(seq_len) + fft_len = seq_len + s_index = 0 + kernel_size = kernel.size(1) + if self.bidirectional: + # split the kernel for each direction of EMA + k1, k2 = torch.split(kernel, [self.embed_dim, self.embed_dim], dim=0) + # (hidden_size X 2*sequence_length - 1) + kernel = F.pad(k1, (kernel_size - 1, 0)) + F.pad(k2.flip(-1), (0, kernel_size - 1)) + inputs = F.pad(inputs, (kernel_size - 1, 0)) + fft_len = fft_len + kernel_size - 1 + s_index = 2 * kernel_size - 2 + + ema_output = self.fft_convolution(inputs, kernel, length=fft_len)[..., s_index : s_index + seq_len] + ema_output = ema_output.type_as(inputs) + # (batch_size X hidden_size X sequence_length) -> (sequence_length X batch_size X hidden_size) + gated_ema_output = F.silu(ema_output.permute(2, 0, 1) + residual) + + return gated_ema_output, None + + +class MegaGatedCrossAttention(nn.Module): + """ + Gated Structured State Attention for use in encoder-decoder model. See Mega paper for more details. Only + modifications from original implementation are variable names, removing the unnecessary `before_attn_fn` and + `static_kv` arguments, and the stateful representation of incremental decoder state. + """ + + def __init__(self, config: MegaConfig): + super().__init__() + + self.config = config + self.activation = ACT2FN[self.config.activation] + self.attention_activation = self.config.attention_activation + self.scaling = self.config.shared_representation_size**-0.5 if self.attention_activation == "softmax" else None + + self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout) + self.hidden_dropout = MegaDropout( + self.config.hidden_dropout_prob, is_featurewise=self.config.use_feature_dropout + ) + # Attention dropout is standard dropout + self.attention_dropout = MegaDropout(self.config.attention_probs_dropout_prob, is_featurewise=False) + + self.prenorm = self.config.normalize_before_mega + self.norm = MegaSequenceNorm( + self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine + ) + + self.k_proj = nn.Linear(self.config.hidden_size, self.config.shared_representation_size) + self.v_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size) + self.q_proj = nn.Linear( + self.config.hidden_size, 2 * self.config.hidden_size + self.config.shared_representation_size + ) + self.h_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size) + + if self.config.relative_positional_bias == "simple": + self.rel_pos_bias = MegaSimpleRelativePositionalBias(config) + elif self.config.relative_positional_bias == "rotary": + self.rel_pos_bias = MegaRotaryRelativePositionalBias(config) + else: + raise ValueError(f"unknown relative position bias: {self.config.relative_positional_bias}") + + self.softmax = nn.Softmax(dim=-1) + + def element_attention(self, query, key, key_padding_mask, pidx): + bsz, src_len, _ = key.size() + tgt_len = query.size(1) if pidx is None else pidx + 1 + if key_padding_mask is not None: + # (batch_size X source_sequence_length) --> (batch_size X 1 X 1) + lengths = key_padding_mask.sum(dim=-1).view(bsz, 1, 1) + else: + lengths = src_len + + # (target_sequence_length X source_sequence_length) + bias = self.rel_pos_bias(max(tgt_len, src_len))[:, :src_len] + if pidx is not None: + if query.size(1) != 1: + raise ValueError("Position offset provided with queries longer than 1 token") + # source_sequence_length + bias = bias[pidx] + else: + # (target_sequence_length X source_sequence_length) + bias = bias[:tgt_len] + + # (batch_size X target_sequence_length X source_sequence_length) + qk = torch.bmm(query, key.transpose(1, 2)) / lengths + bias + + attn_weights = ACT2FN[self.attention_activation](qk).type_as(qk) + + if key_padding_mask is not None: + attn_weights = attn_weights * key_padding_mask.unsqueeze(1) + + return attn_weights + + def softmax_attention(self, query, key, key_padding_mask, pidx): + bsz, src_len, _ = key.size() + tgt_len = query.size(1) if pidx is None else pidx + 1 + + # (target_sequence_length X source_sequence_length) + bias = self.rel_pos_bias(max(tgt_len, src_len))[:, :src_len] + if pidx is not None: + if query.size(1) != 1: + raise ValueError("Position offset provided with queries longer than 1 token") + # source_sequence_length + bias = bias[pidx] + else: + # (target_sequence_length X source_sequence_length) + bias = bias[:tgt_len] + + # scaled attention + query = query * self.scaling + # (batch_size X target_sequence_length X source_sequence_length) + qk = torch.bmm(query, key.transpose(1, 2)) + bias + + if key_padding_mask is not None: + qk = qk.masked_fill((1 - key_padding_mask).unsqueeze(1).to(torch.bool), float("-inf")) + + attn_weights = self.softmax(qk).type_as(qk) + return attn_weights + + def forward( + self, + query, + key: Optional[torch.Tensor], + value: Optional[torch.Tensor], + key_padding_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Gated cross-attention used in Mega + + Args: + query (`torch.Tensor` of shape `(target_sequence_length, batch_size, hidden_size)`): + The self (or target) sequence input used as query inputs for cross-attention + key (`torch.Tensor` of shape `(source_sequence_length, batch_size, hidden_size)`): + The cross (or source) sequence input with shape used as keys in cross-attention + value (`torch.Tensor` of shape `(source_sequence_length, batch_size, hidden_size)`): + The cross (or source) sequence input with shape used as values in cross-attention + key_padding_mask (`torch.LongTensor` of shape `(batch_size, source_sequence_length)`, *optional*): + Padding mask corresponding to the source sequence, where entries are 1 for *not masked* and 0 for + *masked* tokens + past_key_values (`tuple(torch.FloatTensor)`, *optional*): + If provided, the hidden state returned from the previous timestep during incremental decoding; expects + that prior cross-attention keys and values will be the last two items in the tuple + output_attentions (`bool`, defaults to `False`): + Whether or not to return the cross-attention weights. + use_cache (`bool`, defaults to `False`): + Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the + updated EMA hidden state for use in the next step + + Returns: + `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and + inputs: + - **hidden_states** (`torch.FloatTensor` of shape `(target_sequence_length, batch_size, hidden_size)`) -- + Hidden states from target sequence updated by gated cross-attention + - **attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape + `(batch_size, source_sequence_length, target_sequence_length)` -- The pairwise cross-attention weights + corresponding to each token in the source and target sequences + - **cross_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size, + source_sequence_length, config.shared_representation_size)` -- The cross-attention key state for use in + the next step of incremental decoding + - **cross_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size, + source_sequence_length, config.hidden_size)` -- The cross-attention value state for use in the next step + of incremental decoding + """ + + seq_len, bsz, embed_dim = query.size() + if embed_dim != self.config.hidden_size: + raise ValueError( + f"Unexpected embedding dimension received: input is {embed_dim} but expected {self.config.hidden_size}" + ) + + if past_key_values is not None: + # make sure the inputs only have a sequence length of 1 if we're doing incremental decoding + if seq_len != 1: + raise ValueError(f"Incremental decoding requested with self-sequence length > 1: {seq_len}") + # expect past_key_values to have (self_key, self_value, self_ema, cross_key, cross_value) + prev_cross_key, prev_cross_value = past_key_values[-2:] + key = value = None + + # use the self-attention cache to get the position id of the current step + prev_self_key = past_key_values[0] + num_incremental_steps = prev_self_key.size(1) + 1 + else: + prev_cross_key = prev_cross_value = None + # we still need the position id if we're doing incremental decoding (past_key_values will be None for the first step) + num_incremental_steps = 0 if use_cache and (seq_len == 1) else None + + full_query = query + if self.prenorm: + full_query = self.norm(full_query) + + # (target_sequence_length X batch_size X 2*hidden_size + shared_representation_size) + query_projected = self.q_proj(full_query) + # split the query projections into separate components + # - residual_weight is passed through sigmoid and sent through elementwise multiplication to the gated/weighted targets prior to being added to the query directly + # - target_gate is a silu-gated tensor that is multiplied by the attention-weighted target below prior to residual connection + # - attention_query is the part that is passed to the attention function + residual_weight, target_gate, attention_query = torch.split( + query_projected, + [self.config.hidden_size, self.config.hidden_size, self.config.shared_representation_size], + dim=-1, + ) + + # (target_sequence_length X batch_size X hidden_size) + residual_weight = torch.sigmoid(residual_weight) + target_gate = F.silu(target_gate) + + if key is None: + if value is not None: + raise ValueError("Key and value must be `None` simultaneously") + projected_key = projected_value = None + else: + # (source_sequence_length X batch_size X shared_representation_size) + projected_key = self.k_proj(key) + # (source_sequence_length X batch_size X hidden_size) + projected_value = self.activation(self.v_proj(key)) + + # (target_sequence_length X batch_size X shared_representation_size) + # -> (batch_size X target_sequence_length X shared_representation_size) + attention_query = attention_query.transpose(0, 1) + if projected_key is not None: + projected_key = projected_key.transpose(0, 1) + if projected_value is not None: + projected_value = projected_value.transpose(0, 1) + + # if we're doing incremental decoding, k and v are None and need to be overwritten with past values + if past_key_values is not None: + projected_key = prev_cross_key + projected_value = prev_cross_value + + # if we're returning the cache for later use, store these now for later return (can be done without having past_key_values provided) + if use_cache: + updated_cross_key = projected_key + updated_cross_value = projected_value + + ctx_len = projected_key.size(1) + # This is part of a workaround to get around fork/join parallelism + # not supporting Optional types. + if key_padding_mask is not None and key_padding_mask.dim() == 0: + key_padding_mask = None + + if key_padding_mask is not None: + if key_padding_mask.size(0) != bsz: + raise ValueError("Key padding mask does not align on the batch dimension") + if key_padding_mask.size(1) != ctx_len: + raise ValueError("Key padding mask does not align on the sequence length dimension") + + if self.attention_activation == "softmax": + attn_weights = self.softmax_attention( + attention_query, projected_key, key_padding_mask, num_incremental_steps + ) + else: + attn_weights = self.element_attention( + attention_query, projected_key, key_padding_mask, num_incremental_steps + ) + + projected_value = self.hidden_dropout(projected_value, batch_first=True) + kernel = self.attention_dropout(attn_weights) + # (batch_size X target_sequence_length X hidden_size) + # -> (target_sequence_length X batch_size X hidden_size) + weighted_targets = torch.bmm(kernel, projected_value).transpose(0, 1) + # (target_sequence_length X batch_size X hidden_size) + weighted_targets = self.activation(self.h_proj(weighted_targets * target_gate)) + weighted_targets = self.dropout(weighted_targets) + out = torch.addcmul(query, residual_weight, weighted_targets - query) + + if not self.prenorm: + out = self.norm(out) + + outputs = (out, attn_weights) if output_attentions else (out,) + if use_cache: + outputs = outputs + (updated_cross_key, updated_cross_value) + + return outputs + + +class MegaMovingAverageGatedAttention(nn.Module): + """ + Pure PyTorch implementation of Mega block; see https://huggingface.co/papers/2209.10655 and original fairseq implementation + at https://github.com/facebookresearch/mega (copyright Meta Research, licensed under MIT License) + + Differences from original implementation include hidden state refactor and fixed inconsistency with additive / + multiplicative attention masks + """ + + def __init__(self, config: MegaConfig): + super().__init__() + self.config = config + self.activation = ACT2FN[self.config.activation] + self.scaling = ( + self.config.shared_representation_size**-0.5 if self.config.attention_activation == "softmax" else None + ) + self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout) + self.hidden_dropout = MegaDropout( + self.config.hidden_dropout_prob, is_featurewise=self.config.use_feature_dropout + ) + # attention dropout is standard dropout + self.attention_dropout = MegaDropout(self.config.attention_probs_dropout_prob, is_featurewise=False) + + self.norm = MegaSequenceNorm( + self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine + ) + self.ema_gate = MegaMultiDimensionDampedEma(config) + + self.v_proj = nn.Linear(self.config.hidden_size, self.config.intermediate_size) + self.mx_proj = nn.Linear( + self.config.hidden_size, + self.config.shared_representation_size + self.config.intermediate_size + 2 * self.config.hidden_size, + ) + self.h_proj = nn.Linear(self.config.intermediate_size, self.config.hidden_size) + + self.qk_weight = nn.Parameter(torch.Tensor(2, self.config.shared_representation_size)) + self.qk_bias = nn.Parameter(torch.Tensor(2, self.config.shared_representation_size)) + + if self.config.relative_positional_bias == "simple": + self.rel_pos_bias = MegaSimpleRelativePositionalBias(config) + elif self.config.relative_positional_bias == "rotary": + self.rel_pos_bias = MegaRotaryRelativePositionalBias(config) + else: + raise ValueError(f"Unknown relative positional bias: {self.config.relative_positional_bias}") + + self.softmax = nn.Softmax(dim=-1) + self.attention_function = ( + self.softmax_attention if self.config.attention_activation == "softmax" else self.element_attention + ) + + def element_attention(self, query, key, padding_mask, causal_mask): + """ + Apply element-wise attention via relu^2 or laplace. Same as original implementation but with standardized + causal attention mask. Expects the Hugging Face standard attention mask paradigm: 1 for not masked, and 0 for + masked. + """ + seq_len = key.size(2) + if padding_mask is not None: + # (batch_size X number of chunks X 1) + lengths = padding_mask.sum(-1, keepdim=True) + # (batch_size X number of chunks X 1 X 1) + lengths = lengths.clamp(min=1.0).unsqueeze(-1) + else: + lengths = seq_len + + if causal_mask is not None: + lengths = causal_mask.sum(dim=-1, keepdim=True) + + # (sequence_length X sequence_length) + bias = self.rel_pos_bias(seq_len) + if seq_len != query.size(2): + if query.size(2) != 1: + raise ValueError("Size mismatch between Q and K in element attention") + # (1 X sequence_length) + bias = bias[-1:] + + # (batch_size X number of chunks X sequence_length X sequence_length) + qk = torch.matmul(query, key.transpose(2, 3)) / lengths + bias + + attn_weights = ACT2FN[self.config.attention_activation](qk).type_as(qk) + + if padding_mask is not None: + attn_weights = attn_weights * padding_mask.unsqueeze(2) + + if causal_mask is not None: + attn_weights = attn_weights * causal_mask + + return attn_weights + + def softmax_attention(self, query, key, padding_mask, causal_mask): + "Standard softmax self-attention, as in the original Transformer paper" + seq_len = key.size(2) + # (sequence_length X sequence_length) + bias = self.rel_pos_bias(seq_len) + if seq_len != query.size(2): + if query.size(2) != 1: + raise ValueError("Size mismatch between Q and K in softmax attention") + # (1 X sequence_length) + bias = bias[-1:] + + # scaled attention + query = query * self.scaling + + # (batch_size x number of chunks x chunk_size x chunk_size) if chunking + # (batch_size x 1 x sequence_length x sequence_length) otherwise + qk = torch.matmul(query, key.transpose(2, 3)) + bias + + # apply causal mask (presumed to be 1/0 for not masked / masked) + # additive, but convert to 0/-inf (which is not explicitly in the Mega source code) + if causal_mask is not None: + additive_causal_mask = torch.zeros_like(causal_mask, dtype=qk.dtype) + additive_causal_mask = additive_causal_mask.masked_fill((1 - causal_mask).bool(), float("-inf")) + qk = qk + additive_causal_mask + + if padding_mask is not None: + # 1 for tokens which are *not masked* + # 0 for tokens which are *masked* + # replace masked tokens with -inf to make softmax ignore them + # need to invert the padding mask to match what mega original did + padding_mask = 1 - padding_mask + padding_mask_all = padding_mask.all(dim=-1, keepdim=True) + padding_mask = torch.logical_and(padding_mask, ~padding_mask_all) + qk = qk.masked_fill(padding_mask.unsqueeze(2).to(torch.bool), float("-inf")) + + attn_weights = self.softmax(qk).type_as(qk) + return attn_weights + + def forward( + self, + input, + padding_mask: Optional[torch.Tensor] = None, + causal_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions=False, + use_cache=False, + ): + """ + Mega's self-attention block, which combines multi-headed EMA with traditional self-attention + + Args: + input (`torch.Tensor` of shape `(sequence_length, batch_size, hidden_size)`): + Hidden states to be updated by Mega's self-attention + padding_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* + or 0 for *masked* + causal_mask (`torch.LongTensor` of shape `(sequence_length, sequence_length)`, *optional*): + Indicates which inputs are to be ignored due to causal attention, where elements are either 1 for *not + masked* or 0 for *masked* + past_key_values (`tuple(torch.Tensor)`, *optional*): + The hidden states returned from the previous timestep during incremental decoding; expects that + self-attention key, value, and EMA states are the first 3 entries in the tuple + output_attentions (`bool`, default `False`): + Whether to return self-attention weights + use_cache (`bool`, default `False`): + Whether to perform incremental decoding; uses `past_key_values` as prior state, and returns the updated + states for use in the next step + + Returns: + `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and + inputs: + - **hidden_states** (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`) -- Hidden + states from target sequence updated by Mega's self-attention + - **attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape + `(batch_size, 1, sequence_length, sequence_length)` -- The self-attention weights corresponding to how + each token in the input sequence attends to every other token + - **self_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size, + sequence_length, config.shared_representation_size)` -- The self-attention key state for use in the next + step of incremental decoding + - **self_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size, + sequence_length, config.hidden_size)` -- The self-attention value state for use in the next step of + incremental decoding + - **self_ema_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape + `(batch_size, config.ndim)` The incremental EMA state for use in the next step of incremental decoding. + """ + + seq_len, bsz, embed_dim = input.size() + if embed_dim != self.config.hidden_size: + raise ValueError(f"Input embedding dimension should be {self.config.hidden_size}; received {embed_dim}") + + # store inputs for residual connection and handle pre-norm if requested + residual = input + if self.config.normalize_before_mega: + input = self.norm(input) + + # (sequence_length X batch_size X hidden_size) -> (sequence_length X batch_size X intermediate_size) + value = self.activation(self.v_proj(input)) + + # unpack the incremental state if provided + # assumed to be (self K, self V, self EMA state, cross K, cross V) + # also assumes that incremental decoding is working one token at a time, so input sequence length must be 1 + if self.config.is_decoder and (past_key_values is not None): + if seq_len > 1: + raise ValueError(f"Incremental decoding only supports self sequence length of 1; received {seq_len}") + # the first 3 items in the saved states will be these regardless of whether cross-attention is present + prev_self_key, prev_self_value, prev_ema_state = past_key_values[0:3] + else: + prev_self_key = prev_self_value = prev_ema_state = None + + # ema output is (sequence_length x batch_size x hidden_size) + # updated_ema_state will be None if use_cache=False; otherwise (batch_size, config.ndim) + ema_out, updated_ema_state = self.ema_gate( + input, attention_mask=padding_mask, prev_state=prev_ema_state, use_cache=use_cache + ) + ema_out = self.dropout(ema_out) + + # (sequence_length X batch_size X hidden_size) + # -> (sequence_length X batch_size X 2*hidden_size + config.shared_representation_size + config.intermediate_size) + # - residual_weight -> sigmoid -> applied to residual connection in torch.addcmul + # - query_key_gates -> split into two components: query_key becomes query and key for attention input, gates becomes gating for self-attention output + # - intermediate_state -> added to weighted attention output, sent through activation, and has inputs subtracted during + # torch.addcmul to create the final layer output + base = self.mx_proj(ema_out) + residual_weight, query_key_gates, intermediate_state = torch.split( + base, + [ + self.config.hidden_size, + self.config.shared_representation_size + self.config.intermediate_size, + self.config.hidden_size, + ], + dim=-1, + ) + + # (sequence_length X batch_size X hidden_size) + residual_weight = torch.sigmoid(residual_weight) + + # (sequence_length X batch_size X shared_representation_size + intermediate_size) + query_key_gates = F.silu(query_key_gates) + + # split into two different tensors: one for Q/K usage and the other for gating self-attention + query_key, attention_gate = torch.split( + query_key_gates, [self.config.shared_representation_size, self.config.intermediate_size], dim=-1 + ) + + # (sequence_length X batch_size X shared_representation_size) + # -> (sequence_length X batch_size X 1 X shared_representation_size) + # -> (sequence_length X batch_size X 2 X shared_representation_size) + query_key = query_key.unsqueeze(2) * self.qk_weight + self.qk_bias + + # (sequence_length X batch_size X 2 X shared_representation_size) + # -> 2 tensors of (sequence_length X batch_size X shared_representation_size) + query, key = torch.unbind(query_key, dim=2) + + # (sequence_length X batch_size X dimension) + # -> (batch_size X sequence_length X dimension) + # where `dimension` is either shared_representation_size (queries and keys) or intermediate_size (values) + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + if self.config.is_decoder: + # combine history and current to save updated state (if history is provided) + # when chunking is applied, the past states will be None at the end of the chunk, in + # which case, proceed as if no K/V history had been provided + # saved states are stored with shape (batch_size X sequence_length X dimension) + if prev_self_key is not None: + key = torch.cat([prev_self_key, key], dim=1) + if prev_self_value is not None: + value = torch.cat([prev_self_value, value], dim=1) + + # if not chunking, store as-is + if not self.config.use_chunking: + updated_self_key = key + updated_self_value = value + else: + curr_len = key.size(1) % self.config.chunk_size + if curr_len == 0: + # if we're chunking and have reached the end of a chunk, wipe out the saved state + updated_self_key = None + updated_self_value = None + else: + updated_self_key = key + updated_self_value = value + + ctx_len = key.size(1) # potentially differs from seq_len because of incremental decoding + if not self.config.use_chunking: + # if we're not chunking, treat the entire sequence as one long chunk + # (batch_size X sequence_length X dimension) -> (batch_size X 1 X sequence_length X dimension) + query = query.unsqueeze(1) + key = key.unsqueeze(1) + value = value.unsqueeze(1) + if padding_mask is not None: + # (batch_size X sequence_length) -> (batch_size X 1 X sequence_length) + padding_mask = padding_mask.unsqueeze(1) + else: + # otherwise, split the sequences in the batch into `n_chunks` chunks of size `chunk_size` + if seq_len < self.config.chunk_size: + query = query.unsqueeze(1) + else: + # (batch_size X sequence_length X dimension) -> (batch_size X n_chunks X chunk_size X dimension) + n_chunks = seq_len // self.config.chunk_size + query = query.reshape(bsz, n_chunks, self.config.chunk_size, self.config.shared_representation_size) + + if ctx_len < self.config.chunk_size: + key = key.unsqueeze(1) + value = value.unsqueeze(1) + if padding_mask is not None: + padding_mask = padding_mask.unsqueeze(1) + else: + # (batch_size X sequence_length X dimension) -> (batch_size X n_chunks X chunk_size X dimension) + n_chunks = ctx_len // self.config.chunk_size + key = key.reshape(bsz, n_chunks, self.config.chunk_size, self.config.shared_representation_size) + value = value.reshape(bsz, n_chunks, self.config.chunk_size, self.config.intermediate_size) + if padding_mask is not None: + padding_mask = padding_mask.view(bsz, n_chunks, self.config.chunk_size) + + # this is in the original Mega implementation to work around fork/join parallelism not supporting optional types + if padding_mask is not None and padding_mask.dim() == 0: + padding_mask = None + + attn_weights = self.attention_function(query, key, padding_mask=padding_mask, causal_mask=causal_mask) + + value = self.hidden_dropout(value, batch_first=True) + kernel = self.attention_dropout(attn_weights) + + # (batch_size x n_chunks x chunk_size x intermediate_size) -> (sequence_length X batch_size X intermediate_size) + weighted_self_output = ( + torch.matmul(kernel, value).view(bsz, seq_len, self.config.intermediate_size).transpose(0, 1) + ) + + # (sequence_length X batch_size X intermediate_size) -> (sequence_length X batch_size X hidden_size) + weighted_self_output = self.activation(intermediate_state + self.h_proj(weighted_self_output * attention_gate)) + weighted_self_output = self.dropout(weighted_self_output) + # (sequence_length X batch_size X hidden_size) + out = torch.addcmul(residual, residual_weight, weighted_self_output - residual) + + if not self.config.normalize_before_mega: + out = self.norm(out) + + return_values = (out, attn_weights) if output_attentions else (out,) + + if self.config.is_decoder: + return_values = return_values + (updated_self_key, updated_self_value, updated_ema_state) + + return return_values + + +class MegaNormalizedFeedForwardNetwork(nn.Module): + """ + Normalized feed-forward network used in Mega blocks. Left as-is from original Mega repo aside from retrieving args + from Hugging Face config + """ + + def __init__(self, config: MegaConfig): + super().__init__() + + self.config = config + self.hidden_dim = config.nffn_hidden_size + self.act_fn = config.activation + self.activation = ACT2FN[config.activation] + + self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout) + self.hidden_dropout = MegaDropout( + self.config.nffn_activation_dropout_prob, is_featurewise=self.config.use_feature_dropout + ) + + self.prenorm = self.config.normalize_before_ffn + self.norm = MegaSequenceNorm( + self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine + ) + + self.fc1 = nn.Linear(self.config.hidden_size, self.config.nffn_hidden_size) + self.fc2 = nn.Linear(self.config.nffn_hidden_size, self.config.hidden_size) + + def forward(self, inputs): + residual = inputs + + if self.prenorm: + inputs = self.norm(inputs) + + hidden = self.activation(self.fc1(inputs)) + hidden = self.hidden_dropout(hidden) + output = self.fc2(hidden) + output = self.dropout(output) + output = output + residual + + if not self.prenorm: + output = self.norm(output) + + return output + + +class MegaBlock(nn.Module): + def __init__(self, config: MegaConfig): + super().__init__() + self.seq_len_dim = 1 + self.mega_layer = MegaMovingAverageGatedAttention(config) + self.nffn = MegaNormalizedFeedForwardNetwork(config) if config.use_normalized_ffn else None + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.cross_attn = MegaGatedCrossAttention(config) + else: + self.cross_attn = None + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + causal_mask: Optional[torch.LongTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: bool = False, + ) -> tuple[torch.Tensor]: + """ + A single Mega layer: either encoder or decoder, with optional cross-attention and optional normalized + feed-forward layer + + Args: + hidden_states (`torch.Tensor` of shape `(target_sequence_length, batch_size, hidden_size)`): + Hidden states to be updated by the Mega block + attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indicates which entries in the self/target sequence are to be ignored (mostly due to padding), where + elements are either 1 for *not masked* or 0 for *masked*. Causal attention is enforced internally. + causal_mask (`torch.LongTensor` of shape `(sequence_length, sequence_length)`, *optional*): + Indicates which inputs are to be ignored due to causal attention, where elements are either 1 for *not + masked* or 0 for *masked* + encoder_hidden_states (`torch.Tensor`, of shape `(source_sequence_length, batch_size, hidden_size)`, *optional*): + Encoder hidden states to be used for cross-attention (and required for encoder-decoder model setup) + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, source_sequence_length)`, *optional*): + Indicates which entries in the cross/source sequence are to be ignored (mostly due to padding), where + elements are either 1 for *not masked* or 0 for *masked*. + past_key_values (`tuple(torch.Tensor)`, *optional*): + The hidden states returned from the previous timestep during incremental decoding; expects that + self-attention key, value, and EMA states are the first 3 entries in the tuple, and (if doing + cross-attention) cross-attention key and value are the last 2 entries in the tuple + output_attentions (`bool`, default `False`): + Whether to return self-attention weights + use_cache (`bool`, default `False`): + Whether to perform incremental decoding; uses `past_key_values` as prior state, and returns the updated + states for use in the next step + + Returns: + `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and + inputs: + - **hidden_states** (`torch.FloatTensor` of shape `(target_sequence_length, batch_size, hidden_size)`) -- + Hidden states from target sequence updated by Mega + - **self_attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape + `(batch_size, 1, target_sequence_length, target_sequence_length)` -- The self-attention weights + corresponding to how each token in the input sequence attends to every other token + - **cross_attn_weights** (*optional*, returned when `output_attentions=True` and + `config.add_cross_attention=True`) `torch.FloatTensor` of shape `(batch_size, source_sequence_length, + target_sequence_length)` -- Pairwise cross-attention weights between every entry in the source sequence + and target sequence + - **self_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size, + sequence_length, config.shared_representation_size)` -- The self-attention key state for use in the next + step of incremental decoding + - **self_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size, + sequence_length, config.hidden_size)` -- The self-attention value state for use in the next step of + incremental decoding + - **self_ema_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape + `(batch_size, config.ndim)` The incremental EMA state for use in the next step of incremental decoding. + - **cross_key** (*optional*, returned when `use_cache=True` and `config.is_decoder=True`) + `torch.FloatTensor` of shape `(batch_size, source_sequence_length, config.shared_representation_size)` -- + The cross-attention key state for use in the next step of incremental decoding + - **cross_value** (*optional*, returned when `use_cache=True` and `config.is_decoder=True`) + `torch.FloatTensor` of shape `(batch_size, source_sequence_length, config.hidden_size)` -- The + cross-attention value state for use in the next step of incremental decoding + """ + + # incremental decoding in the MegaMultiDimensionDampedEma module requires that the attention mask has the same + # sequence length as the input tensor; if we're caching incremental states, we assume the input + # sequence length is 1 (Mega will break otherwise), so we take the padding mask for the final + # token in the input (mask is received as [batch X sequence length]) + if use_cache and (past_key_values is not None) and (attention_mask is not None): + mega_padding_mask = attention_mask[:, -1].unsqueeze(-1) + else: + mega_padding_mask = attention_mask + + mega_outputs = self.mega_layer( + input=hidden_states, + padding_mask=mega_padding_mask, + causal_mask=causal_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + new_hidden_states = mega_outputs[0] + self_key, self_value, self_ema_state = mega_outputs[-3:] if use_cache else (None, None, None) + self_attention_weights = mega_outputs[1] if output_attentions else None + + # optional cross attention + if self.cross_attn is not None: + if encoder_hidden_states is None: + raise ValueError("Requested cross-attention without providing encoder hidden states") + + cross_attn_outputs = self.cross_attn( + query=new_hidden_states, + key=encoder_hidden_states, + value=encoder_hidden_states, + key_padding_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + # update the hidden state from cross attention + new_hidden_states = cross_attn_outputs[0] + # store cross-attention k/v if caching + cross_key, cross_value = cross_attn_outputs[-2:] if use_cache else (None, None) + cross_attention_weights = cross_attn_outputs[1] if output_attentions else None + + # optional NFFN follows cross attention + if self.nffn is not None: + new_hidden_states = self.nffn(new_hidden_states) + + outs = (new_hidden_states,) + if output_attentions: + outs = outs + (self_attention_weights,) + if self.cross_attn is not None: + outs = outs + (cross_attention_weights,) + + if use_cache: + new_key_values = ( + self_key, + self_value, + self_ema_state, + ) + if self.cross_attn is not None: + new_key_values = new_key_values + (cross_key, cross_value) + + outs = outs + (new_key_values,) + + return outs + + +# copied from transformers.models.roberta.modeling_roberta.RobertaPooler with Roberta->Mega +class MegaPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class MegaPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: MegaConfig + base_model_prefix = "mega" + supports_gradient_checkpointing = False + _no_split_modules = ["MegaMovingAverageGatedAttention"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, MegaMultiDimensionDampedEma): + with torch.no_grad(): + # delta & alpha + nn.init.normal_(module.damping_factor, mean=0.0, std=self.config.ema_delta_alpha_range) + nn.init.normal_(module.decay_factor, mean=0.0, std=self.config.ema_delta_alpha_range) + # beta [1, -1, 1, -1, ...] seems more stable. + val = torch.ones(self.config.ema_projection_size, 1) + if self.config.ema_projection_size > 1: + idx = torch.tensor(list(range(1, self.config.ema_projection_size, 2))) + val.index_fill_(0, idx, -1.0) + module.ema_expansion_matrix.normal_(mean=0.0, std=self.config.ema_beta_range).add_(val) + # gamma & omega + nn.init.normal_(module.kernel_projection_matrix, mean=0.0, std=self.config.ema_gamma_omega_range) + nn.init.normal_(module.residual_weight, mean=0.0, std=self.config.ema_gamma_omega_range) + elif isinstance(module, MegaSimpleRelativePositionalBias): + nn.init.normal_(module.rel_pos_bias, mean=0.0, std=self.config.initializer_range) + elif isinstance(module, MegaRotaryRelativePositionalBias): + nn.init.normal_(module.alpha, mean=0.0, std=self.config.initializer_range) + nn.init.normal_(module.b_param, mean=0.0, std=self.config.initializer_range) + elif isinstance(module, MegaScaleNorm): + if self.config.norm_affine: + nn.init.constant_(module.scalar, 1.0) + elif isinstance(module, MegaRMSNorm): + if self.config.norm_affine: + nn.init.constant_(module.weight, 1.0) + elif isinstance(module, MegaMovingAverageGatedAttention): + # linear layers covered separately by the generic nn.Linear init below + nn.init.normal_(module.qk_weight, mean=0.0, std=self.config.initializer_range) + nn.init.constant_(module.qk_bias, 0.0) + elif isinstance(module, nn.Linear): + # initializes all linear layers in the entire network + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +MEGA_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MegaConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +MEGA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `add_token_type_embeddings` parameter + set to `True`. All the value in this tensor should be always < config.type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare MEGA Model transformer outputting raw hidden-states without any specific head on top.", + MEGA_START_DOCSTRING, +) +class MegaModel(MegaPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added after self-attention, following the architecture described in *Mega: Moving Average + Equipped Gated Attention*_ by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, + Jonathan May, and Luke Zettlemoyer + + To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to + `True` and `bidirectional` set to `False`. To be used in a Seq2Seq model, the model needs to initialized with both + `is_decoder=True` and `bidirectional=False` argument as well as `add_cross_attention` set to `True`; an + `encoder_hidden_states` is then expected as an input to the forward pass. + + .. _*Mega: Moving Average Equipped Gated Attention*: https://huggingface.co/papers/2209.10655 + + """ + + def __init__(self, config: MegaConfig, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embedding_layer = MegaEmbeddings(config) + self.layers = nn.ModuleList([MegaBlock(config) for _ in range(config.num_hidden_layers)]) + + self.pooler = MegaPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing (retained from RoBERTa code) + self.post_init() + + def get_input_embeddings(self): + return self.embedding_layer.word_embeddings + + def set_input_embeddings(self, value): + self.embedding_layer.word_embeddings = value + + @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + device = input_ids.device + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + device = inputs_embeds.device + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if self.config.use_chunking: + input_shape = torch.tensor([input_shape[0], self.config.chunk_size]) + + batch_size, sequence_length = input_shape + + if self.config.use_chunking and (sequence_length > self.config.chunk_size): + if sequence_length % self.config.chunk_size != 0: + raise ValueError( + f"config.use_chunking is activated; input sequence length must be shorter than or a multiple of config.chunk_size\nreceived sequence length of {sequence_length} with chunk size {self.config.chunk_size}" + ) + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + + # Mega expects the causal mask to be a 2D square matrix of (from) x (to) over the input sequence length + # the HF utility function generates a 3D causal mask which includes batch size, so we'll create a dummy + # mask with the correct device and all ones + temp_mask_for_extension = torch.ones((1, sequence_length), dtype=torch.long, device=device) + causal_mask = self.create_extended_attention_mask_for_decoder(input_shape, temp_mask_for_extension) + + # get rid of batch dimension in the generated mask; result is (sequence_length X sequence_length) + causal_mask = causal_mask.squeeze(0) + else: + use_cache = False + causal_mask = None + + # if using cache, make sure we have a tuple of tuples which matches the length of our hidden layers + if (past_key_values is not None) and (len(past_key_values) != self.config.num_hidden_layers): + raise ValueError( + f"Received past key/value cache with size mismatch; expected {self.config.num_hidden_layers}, received {len(past_key_values)}" + ) + + # get embeddings (batch X sequence length X embed dim) + embedding_output = self.embedding_layer( + input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + + # transpose for Mega --> (seq len X batch X embed dim) + hidden_states = embedding_output.transpose(0, 1) + + # we expect encoder hidden states to also have batch first in line + # with typical Hugging Face behavior (which is also how we return them) + # Mega expects sequence length first, so do the same transpose here + if encoder_hidden_states is not None: + encoder_hidden_states = encoder_hidden_states.transpose(0, 1) + + # pass through mega layers + all_hidden_states = (embedding_output,) if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + next_decoder_cache = () if use_cache else None + for i, mega_layer in enumerate(self.layers): + current_decoder_cache = past_key_values[i] if past_key_values is not None else None + mega_outputs = mega_layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_mask=causal_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=current_decoder_cache, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = mega_outputs[0] + if output_hidden_states: + # store layer-wise hidden states in the way that the user expects + # (seq len X batch X embed dim) --> (batch X seq len X embed dim) + all_hidden_states += (hidden_states.transpose(0, 1),) + if output_attentions: + self_attn_weights = mega_outputs[1] + all_self_attentions += (self_attn_weights,) + if self.config.add_cross_attention: + cross_attn_weights = mega_outputs[2] + all_cross_attentions += (cross_attn_weights,) + if use_cache: + updated_cache = mega_outputs[-1] + next_decoder_cache += (updated_cache,) + + # transpose final hidden states + hidden_states = hidden_states.transpose(0, 1) + + # optional pooling layer + pooled_output = self.pooler(hidden_states) if self.pooler is not None else None + + if not return_dict: + return (hidden_states, pooled_output) + ( + all_hidden_states, + next_decoder_cache, + all_self_attentions, + all_cross_attentions, + ) + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=hidden_states, + pooler_output=pooled_output, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + """MEGA Model with a `language modeling` head on top for CLM fine-tuning.""", MEGA_START_DOCSTRING +) +class MegaForCausalLM(MegaPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: MegaConfig): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `MegaForCausalLM` as a standalone, add `is_decoder=True.`") + + self.mega = MegaModel(config, add_pooling_layer=False) + + if config.add_lm_hidden_dense_layer: + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.hidden_activation = nn.Tanh() + else: + self.dense = None + self.hidden_activation = None + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MegaForCausalLM, AutoConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("mnaylor/mega-base-wikitext") + >>> config = AutoConfig.from_pretrained("mnaylor/mega-base-wikitext") + >>> config.is_decoder = True + >>> config.bidirectional = False + >>> model = MegaForCausalLM.from_pretrained( + ... "mnaylor/mega-base-wikitext", config=config, ignore_mismatched_sizes=True + ... ) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.mega( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + if self.dense is not None: + sequence_output = self.dense(sequence_output) + sequence_output = self.hidden_activation(sequence_output) + + prediction_scores = self.lm_head(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings("""MEGA Model with a `language modeling` head on top.""", MEGA_START_DOCSTRING) +class MegaForMaskedLM(MegaPreTrainedModel): + _tied_weights_keys = ["mlm_head.weight"] + + def __init__(self, config: MegaConfig): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `MegaForMaskedLM`, set `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.mega = MegaModel(config, add_pooling_layer=False) + if config.add_lm_hidden_dense_layer: + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.hidden_activation = nn.Tanh() + else: + self.dense = None + self.hidden_activation = None + self.mlm_head = nn.Linear(config.hidden_size, config.vocab_size) + self.dropout = nn.Dropout(config.dropout_prob) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.mlm_head + + def set_output_embeddings(self, new_embeddings): + self.mlm_head = new_embeddings + + @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + mask="", + expected_output="' Paris'", + expected_loss=0.1, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + kwargs (`dict[str, any]`, optional, defaults to *{}*): + Used to hide legacy arguments that have been deprecated. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mega( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + if self.dense is not None: + sequence_output = self.dense(sequence_output) + sequence_output = self.hidden_activation(sequence_output) + prediction_scores = self.mlm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + MEGA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + MEGA_START_DOCSTRING, +) +class MegaForSequenceClassification(MegaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.mega = MegaModel(config, add_pooling_layer=False) + self.classifier = MegaClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mega( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + MEGA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + MEGA_START_DOCSTRING, +) +class MegaForMultipleChoice(MegaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.mega = MegaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.mega( + flat_input_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + MEGA Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + MEGA_START_DOCSTRING, +) +class MegaForTokenClassification(MegaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.mega = MegaModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mega( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +# copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Mega +class MegaClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@add_start_docstrings( + """ + MEGA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + MEGA_START_DOCSTRING, +) +class MegaForQuestionAnswering(MegaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.mega = MegaModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mega( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "MegaForCausalLM", + "MegaForMaskedLM", + "MegaForMultipleChoice", + "MegaForQuestionAnswering", + "MegaForSequenceClassification", + "MegaForTokenClassification", + "MegaModel", + "MegaPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03b556e2eddf6d5f81f6f4a15346596dd5a32f85 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_mmbt import * + from .modeling_mmbt import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/configuration_mmbt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/configuration_mmbt.py new file mode 100644 index 0000000000000000000000000000000000000000..1c58e4e6cdd0d0a8c87b9b94ca896c87970b5654 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/configuration_mmbt.py @@ -0,0 +1,45 @@ +# coding=utf-8 +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MMBT configuration""" + +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class MMBTConfig: + """ + This is the configuration class to store the configuration of a [`MMBTModel`]. It is used to instantiate a MMBT + model according to the specified arguments, defining the model architecture. + + Args: + config ([`PreTrainedConfig`]): + Config of the underlying Transformer models. Its values are copied over to use a single config. + num_labels (`int`, *optional*): + Size of final Linear layer for classification. + modal_hidden_size (`int`, *optional*, defaults to 2048): + Embedding dimension of the non-text modality encoder. + """ + + def __init__(self, config, num_labels=None, modal_hidden_size=2048): + self.__dict__ = config.__dict__ + self.modal_hidden_size = modal_hidden_size + if num_labels: + self.num_labels = num_labels + + +__all__ = ["MMBTConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/modeling_mmbt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/modeling_mmbt.py new file mode 100644 index 0000000000000000000000000000000000000000..45ae577f7fced2d276f4f54c5bf859e27e08ebae --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/modeling_mmbt.py @@ -0,0 +1,410 @@ +# coding=utf-8 +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch MMBT model.""" + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +from ....modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput +from ....modeling_utils import ModuleUtilsMixin +from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MMBTConfig" + + +class ModalEmbeddings(nn.Module): + """Generic Modal Embeddings which takes in an encoder, and a transformer embedding.""" + + def __init__(self, config, encoder, embeddings): + super().__init__() + self.config = config + self.encoder = encoder + self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size) + self.position_embeddings = embeddings.position_embeddings + self.token_type_embeddings = embeddings.token_type_embeddings + self.word_embeddings = embeddings.word_embeddings + self.LayerNorm = embeddings.LayerNorm + self.dropout = nn.Dropout(p=config.hidden_dropout_prob) + + def forward(self, input_modal, start_token=None, end_token=None, position_ids=None, token_type_ids=None): + token_embeddings = self.proj_embeddings(self.encoder(input_modal)) + seq_length = token_embeddings.size(1) + + if start_token is not None: + start_token_embeds = self.word_embeddings(start_token) + seq_length += 1 + token_embeddings = torch.cat([start_token_embeds.unsqueeze(1), token_embeddings], dim=1) + + if end_token is not None: + end_token_embeds = self.word_embeddings(end_token) + seq_length += 1 + token_embeddings = torch.cat([token_embeddings, end_token_embeds.unsqueeze(1)], dim=1) + + if position_ids is None: + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_modal.device) + position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length) + + if token_type_ids is None: + token_type_ids = torch.zeros( + (input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device + ) + + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = token_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +MMBT_START_DOCSTRING = r""" + MMBT model was proposed in [Supervised Multimodal Bitransformers for Classifying Images and + Text](https://github.com/facebookresearch/mmbt) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine. + It's a supervised multimodal bitransformer model that fuses information from text and other image encoders, and + obtain state-of-the-art performance on various multimodal classification benchmark tasks. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MMBTConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. + transformer (`nn.Module`): A text transformer that is used by MMBT. + It should have embeddings, encoder, and pooler attributes. + encoder (`nn.Module`): Encoder for the second modality. + It should take in a batch of modal inputs and return k, n dimension embeddings. +""" + +MMBT_INPUTS_DOCSTRING = r""" + Args: + input_modal (`torch.FloatTensor` of shape `(batch_size, ***)`): + The other modality data. It will be the shape that the encoder for that type expects. e.g. With an Image + Encoder, the shape would be (batch_size, channels, height, width) + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. It does not expect [CLS] token to be added as it's + appended to the end of other modality embeddings. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + modal_start_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Optional start token to be added to Other Modality Embedding. [CLS] Most commonly used for classification + tasks. + modal_end_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Optional end token to be added to Other Modality Embedding. [SEP] Most commonly used. + attention_mask (*optional*) `torch.FloatTensor` of shape `(batch_size, sequence_length)`: + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (*optional*) `torch.LongTensor` of shape `(batch_size, sequence_length)`: + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + modal_token_type_ids (*optional*) `torch.LongTensor` of shape `(batch_size, modal_sequence_length)`: + Segment token indices to indicate different portions of the non-text modality. The embeddings from these + tokens will be summed with the respective token embeddings for the non-text modality. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + modal_position_ids (`torch.LongTensor` of shape `(batch_size, modal_sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings for the non-text modality. + Selected in the range `[0, config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, embedding_dim)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare MMBT Model outputting raw hidden-states without any specific head on top.", + MMBT_START_DOCSTRING, +) +class MMBTModel(nn.Module, ModuleUtilsMixin): + def __init__(self, config, transformer, encoder): + super().__init__() + self.config = config + self.transformer = transformer + self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings) + + @add_start_docstrings_to_model_forward(MMBT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_modal, + input_ids=None, + modal_start_tokens=None, + modal_end_tokens=None, + attention_mask=None, + token_type_ids=None, + modal_token_type_ids=None, + position_ids=None, + modal_position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + # For example purposes. Not runnable. + transformer = BertModel.from_pretrained("google-bert/bert-base-uncased") + encoder = ImageEncoder(args) + mmbt = MMBTModel(config, transformer, encoder) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_txt_shape = input_ids.size() + elif inputs_embeds is not None: + input_txt_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + modal_embeddings = self.modal_encoder( + input_modal, + start_token=modal_start_tokens, + end_token=modal_end_tokens, + position_ids=modal_position_ids, + token_type_ids=modal_token_type_ids, + ) + + input_modal_shape = modal_embeddings.size()[:-1] + + if token_type_ids is None: + token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device) + + txt_embeddings = self.transformer.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + + embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1) + + input_shape = embedding_output.size()[:-1] + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + else: + attention_mask = torch.cat( + [torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1 + ) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(input_shape, device=device) + else: + encoder_attention_mask = torch.cat( + [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1 + ) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.transformer.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + pooled_output = self.transformer.pooler(sequence_output) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + +@add_start_docstrings( + """ + MMBT Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) + """, + MMBT_START_DOCSTRING, + MMBT_INPUTS_DOCSTRING, +) +class MMBTForClassification(nn.Module): + r""" + **labels**: (*optional*) `torch.LongTensor` of shape `(batch_size,)`: + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: *Tuple* comprising various elements depending on the configuration (config) and inputs: **loss**: + (*optional*, returned when `labels` is provided) `torch.FloatTensor` of shape `(1,)`: Classification (or + regression if config.num_labels==1) loss. **logits**: + `torch.FloatTensor` of shape `(batch_size, config.num_labels)` Classification (or regression if + config.num_labels==1) scores (before SoftMax). + **hidden_states**: (*optional*, returned when `output_hidden_states=True`) list of `torch.FloatTensor` (one for + the output of each layer + the output of the embeddings) of shape `(batch_size, sequence_length, hidden_size)`: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. **attentions**: + (*optional*, returned when `output_attentions=True`) list of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used + to compute the weighted average in the self-attention heads. + + Examples: + + ```python + # For example purposes. Not runnable. + transformer = BertModel.from_pretrained("google-bert/bert-base-uncased") + encoder = ImageEncoder(args) + model = MMBTForClassification(config, transformer, encoder) + outputs = model(input_modal, input_ids, labels=labels) + loss, logits = outputs[:2] + ```""" + + def __init__(self, config, transformer, encoder): + super().__init__() + self.num_labels = config.num_labels + + self.mmbt = MMBTModel(config, transformer, encoder) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_modal, + input_ids=None, + modal_start_tokens=None, + modal_end_tokens=None, + attention_mask=None, + token_type_ids=None, + modal_token_type_ids=None, + position_ids=None, + modal_position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mmbt( + input_modal=input_modal, + input_ids=input_ids, + modal_start_tokens=modal_start_tokens, + modal_end_tokens=modal_end_tokens, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + modal_token_type_ids=modal_token_type_ids, + position_ids=position_ids, + modal_position_ids=modal_position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["MMBTForClassification", "MMBTModel", "ModalEmbeddings"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c5373969ce7831491b0d5fa5495078fb1d3f6e4e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_nat import * + from .modeling_nat import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/configuration_nat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/configuration_nat.py new file mode 100644 index 0000000000000000000000000000000000000000..002eaa7f82b6927ab383523e9b9a6abc5c2433a9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/configuration_nat.py @@ -0,0 +1,148 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Neighborhood Attention Transformer model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging +from ....utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class NatConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`NatModel`]. It is used to instantiate a Nat model + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Nat + [shi-labs/nat-mini-in1k-224](https://huggingface.co/shi-labs/nat-mini-in1k-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + patch_size (`int`, *optional*, defaults to 4): + The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + embed_dim (`int`, *optional*, defaults to 64): + Dimensionality of patch embedding. + depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 5]`): + Number of layers in each level of the encoder. + num_heads (`list[int]`, *optional*, defaults to `[2, 4, 8, 16]`): + Number of attention heads in each layer of the Transformer encoder. + kernel_size (`int`, *optional*, defaults to 7): + Neighborhood Attention kernel size. + mlp_ratio (`float`, *optional*, defaults to 3.0): + Ratio of MLP hidden dimensionality to embedding dimensionality. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether or not a learnable bias should be added to the queries, keys and values. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + drop_path_rate (`float`, *optional*, defaults to 0.1): + Stochastic depth rate. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + layer_scale_init_value (`float`, *optional*, defaults to 0.0): + The initial value for the layer scale. Disabled if <=0. + out_features (`list[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`list[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + + Example: + + ```python + >>> from transformers import NatConfig, NatModel + + >>> # Initializing a Nat shi-labs/nat-mini-in1k-224 style configuration + >>> configuration = NatConfig() + + >>> # Initializing a model (with random weights) from the shi-labs/nat-mini-in1k-224 style configuration + >>> model = NatModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "nat" + + attribute_map = { + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + } + + def __init__( + self, + patch_size=4, + num_channels=3, + embed_dim=64, + depths=[3, 4, 6, 5], + num_heads=[2, 4, 8, 16], + kernel_size=7, + mlp_ratio=3.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + initializer_range=0.02, + layer_norm_eps=1e-5, + layer_scale_init_value=0.0, + out_features=None, + out_indices=None, + **kwargs, + ): + super().__init__(**kwargs) + + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_layers = len(depths) + self.num_heads = num_heads + self.kernel_size = kernel_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + # we set the hidden_size attribute in order to make Nat work with VisionEncoderDecoderModel + # this indicates the channel dimension after the last stage of the model + self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) + self.layer_scale_init_value = layer_scale_init_value + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + + +__all__ = ["NatConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/modeling_nat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/modeling_nat.py new file mode 100644 index 0000000000000000000000000000000000000000..a619cdb11225779eb6ab499840658754be65c9ff --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/modeling_nat.py @@ -0,0 +1,932 @@ +# coding=utf-8 +# Copyright 2022 SHI Labs and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Neighborhood Attention Transformer model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ....activations import ACT2FN +from ....modeling_outputs import BackboneOutput +from ....modeling_utils import PreTrainedModel +from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ....utils import ( + ModelOutput, + OptionalDependencyNotAvailable, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_natten_available, + logging, + replace_return_docstrings, + requires_backends, +) +from ....utils.backbone_utils import BackboneMixin +from .configuration_nat import NatConfig + + +if is_natten_available(): + from natten.functional import natten2dav, natten2dqkrpb +else: + + def natten2dqkrpb(*args, **kwargs): + raise OptionalDependencyNotAvailable() + + def natten2dav(*args, **kwargs): + raise OptionalDependencyNotAvailable() + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "NatConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "shi-labs/nat-mini-in1k-224" +_EXPECTED_OUTPUT_SHAPE = [1, 7, 7, 512] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "shi-labs/nat-mini-in1k-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat" + + +# drop_path and NatDropPath are from the timm library. + + +@dataclass +class NatEncoderOutput(ModelOutput): + """ + Nat encoder's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class NatModelOutput(ModelOutput): + """ + Nat model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed): + Average pooling of the last layer hidden-state. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class NatImageClassifierOutput(ModelOutput): + """ + Nat outputs for image classification. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +class NatEmbeddings(nn.Module): + """ + Construct the patch and position embeddings. + """ + + def __init__(self, config): + super().__init__() + + self.patch_embeddings = NatPatchEmbeddings(config) + + self.norm = nn.LayerNorm(config.embed_dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> tuple[torch.Tensor]: + embeddings = self.patch_embeddings(pixel_values) + embeddings = self.norm(embeddings) + + embeddings = self.dropout(embeddings) + + return embeddings + + +class NatPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + patch_size = config.patch_size + num_channels, hidden_size = config.num_channels, config.embed_dim + self.num_channels = num_channels + + if patch_size == 4: + pass + else: + # TODO: Support arbitrary patch sizes. + raise ValueError("Dinat only supports patch size of 4 at the moment.") + + self.projection = nn.Sequential( + nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), + nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), + ) + + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor: + _, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + embeddings = self.projection(pixel_values) + embeddings = embeddings.permute(0, 2, 3, 1) + + return embeddings + + +class NatDownsampler(nn.Module): + """ + Convolutional Downsampling Layer. + + Args: + dim (`int`): + Number of input channels. + norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. + """ + + def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None: + super().__init__() + self.dim = dim + self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) + self.norm = norm_layer(2 * dim) + + def forward(self, input_feature: torch.Tensor) -> torch.Tensor: + input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) + input_feature = self.norm(input_feature) + return input_feature + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class NatDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class NeighborhoodAttention(nn.Module): + def __init__(self, config, dim, num_heads, kernel_size): + super().__init__() + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.kernel_size = kernel_size + + # rpb is learnable relative positional biases; same concept is used Swin. + self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1))) + + self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 3, 1, 2, 4) + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + query_layer = self.transpose_for_scores(self.query(hidden_states)) + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + # Apply the scale factor before computing attention weights. It's usually more efficient because + # attention weights are typically a bigger tensor compared to query. + # It gives identical results because scalars are commutable in matrix multiplication. + query_layer = query_layer / math.sqrt(self.attention_head_size) + + # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases. + attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, 1) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, 1) + context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class NeighborhoodAttentionOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, dim) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class NeighborhoodAttentionModule(nn.Module): + def __init__(self, config, dim, num_heads, kernel_size): + super().__init__() + self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size) + self.output = NeighborhoodAttentionOutput(config, dim) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + self_outputs = self.self(hidden_states, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class NatIntermediate(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class NatOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class NatLayer(nn.Module): + def __init__(self, config, dim, num_heads, drop_path_rate=0.0): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.kernel_size = config.kernel_size + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attention = NeighborhoodAttentionModule(config, dim, num_heads, kernel_size=self.kernel_size) + self.drop_path = NatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.intermediate = NatIntermediate(config, dim) + self.output = NatOutput(config, dim) + self.layer_scale_parameters = ( + nn.Parameter(config.layer_scale_init_value * torch.ones((2, dim)), requires_grad=True) + if config.layer_scale_init_value > 0 + else None + ) + + def maybe_pad(self, hidden_states, height, width): + window_size = self.kernel_size + pad_values = (0, 0, 0, 0, 0, 0) + if height < window_size or width < window_size: + pad_l = pad_t = 0 + pad_r = max(0, window_size - width) + pad_b = max(0, window_size - height) + pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b) + hidden_states = nn.functional.pad(hidden_states, pad_values) + return hidden_states, pad_values + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor, torch.Tensor]: + batch_size, height, width, channels = hidden_states.size() + shortcut = hidden_states + + hidden_states = self.layernorm_before(hidden_states) + # pad hidden_states if they are smaller than kernel size + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + + _, height_pad, width_pad, _ = hidden_states.shape + + attention_outputs = self.attention(hidden_states, output_attentions=output_attentions) + + attention_output = attention_outputs[0] + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + attention_output = attention_output[:, :height, :width, :].contiguous() + + if self.layer_scale_parameters is not None: + attention_output = self.layer_scale_parameters[0] * attention_output + + hidden_states = shortcut + self.drop_path(attention_output) + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.output(self.intermediate(layer_output)) + + if self.layer_scale_parameters is not None: + layer_output = self.layer_scale_parameters[1] * layer_output + + layer_output = hidden_states + self.drop_path(layer_output) + + layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) + return layer_outputs + + +class NatStage(nn.Module): + def __init__(self, config, dim, depth, num_heads, drop_path_rate, downsample): + super().__init__() + self.config = config + self.dim = dim + self.layers = nn.ModuleList( + [ + NatLayer( + config=config, + dim=dim, + num_heads=num_heads, + drop_path_rate=drop_path_rate[i], + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm) + else: + self.downsample = None + + self.pointing = False + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + _, height, width, _ = hidden_states.size() + for i, layer_module in enumerate(self.layers): + layer_outputs = layer_module(hidden_states, output_attentions) + hidden_states = layer_outputs[0] + + hidden_states_before_downsampling = hidden_states + if self.downsample is not None: + hidden_states = self.downsample(hidden_states_before_downsampling) + + stage_outputs = (hidden_states, hidden_states_before_downsampling) + + if output_attentions: + stage_outputs += layer_outputs[1:] + return stage_outputs + + +class NatEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.num_levels = len(config.depths) + self.config = config + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")] + self.levels = nn.ModuleList( + [ + NatStage( + config=config, + dim=int(config.embed_dim * 2**i_layer), + depth=config.depths[i_layer], + num_heads=config.num_heads[i_layer], + drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=NatDownsampler if (i_layer < self.num_levels - 1) else None, + ) + for i_layer in range(self.num_levels) + ] + ) + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + output_hidden_states_before_downsampling: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple, NatEncoderOutput]: + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + # rearrange b h w c -> b c h w + reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + for i, layer_module in enumerate(self.levels): + layer_outputs = layer_module(hidden_states, output_attentions) + + hidden_states = layer_outputs[0] + hidden_states_before_downsampling = layer_outputs[1] + + if output_hidden_states and output_hidden_states_before_downsampling: + # rearrange b h w c -> b c h w + reshaped_hidden_state = hidden_states_before_downsampling.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states_before_downsampling,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + elif output_hidden_states and not output_hidden_states_before_downsampling: + # rearrange b h w c -> b c h w + reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + if output_attentions: + all_self_attentions += layer_outputs[2:] + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + + return NatEncoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + reshaped_hidden_states=all_reshaped_hidden_states, + ) + + +class NatPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: NatConfig + base_model_prefix = "nat" + main_input_name = "pixel_values" + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +NAT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`NatConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +NAT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] + for details. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Nat Model transformer outputting raw hidden-states without any specific head on top.", + NAT_START_DOCSTRING, +) +class NatModel(NatPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + + requires_backends(self, ["natten"]) + + self.config = config + self.num_levels = len(config.depths) + self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1)) + + self.embeddings = NatEmbeddings(config) + self.encoder = NatEncoder(config) + + self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) + self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=NatModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, NatModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + pooled_output = None + if self.pooler is not None: + pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2)) + pooled_output = torch.flatten(pooled_output, 1) + + if not return_dict: + output = (sequence_output, pooled_output) + encoder_outputs[1:] + + return output + + return NatModelOutput( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + reshaped_hidden_states=encoder_outputs.reshaped_hidden_states, + ) + + +@add_start_docstrings( + """ + Nat Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. + """, + NAT_START_DOCSTRING, +) +class NatForImageClassification(NatPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + requires_backends(self, ["natten"]) + + self.num_labels = config.num_labels + self.nat = NatModel(config) + + # Classifier head + self.classifier = ( + nn.Linear(self.nat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=NatImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, NatImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.nat( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return NatImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + reshaped_hidden_states=outputs.reshaped_hidden_states, + ) + + +@add_start_docstrings( + "NAT backbone, to be used with frameworks like DETR and MaskFormer.", + NAT_START_DOCSTRING, +) +class NatBackbone(NatPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + + requires_backends(self, ["natten"]) + + self.embeddings = NatEmbeddings(config) + self.encoder = NatEncoder(config) + self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))] + + # Add layer norms to hidden states of out_features + hidden_states_norms = {} + for stage, num_channels in zip(self.out_features, self.channels): + hidden_states_norms[stage] = nn.LayerNorm(num_channels) + self.hidden_states_norms = nn.ModuleDict(hidden_states_norms) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + @add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224") + >>> model = AutoBackbone.from_pretrained( + ... "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"] + ... ) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 512, 7, 7] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, + output_attentions=output_attentions, + output_hidden_states=True, + output_hidden_states_before_downsampling=True, + return_dict=True, + ) + + hidden_states = outputs.reshaped_hidden_states + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + # TODO can we simplify this? + batch_size, num_channels, height, width = hidden_state.shape + hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous() + hidden_state = hidden_state.view(batch_size, height * width, num_channels) + hidden_state = self.hidden_states_norms[stage](hidden_state) + hidden_state = hidden_state.view(batch_size, height, width, num_channels) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + feature_maps += (hidden_state,) + + if not return_dict: + output = (feature_maps,) + if output_hidden_states: + output += (outputs.hidden_states,) + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +__all__ = ["NatForImageClassification", "NatModel", "NatPreTrainedModel", "NatBackbone"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f0690129ae9edf84829278790b5e065bbd6608ee --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_nezha import * + from .modeling_nezha import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/configuration_nezha.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/configuration_nezha.py new file mode 100644 index 0000000000000000000000000000000000000000..00d193cd1ae68655e1631b7caea2220987a29f0b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/configuration_nezha.py @@ -0,0 +1,105 @@ +from .... import PretrainedConfig + + +class NezhaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`NezhaModel`]. It is used to instantiate an Nezha + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Nezha + [sijunhe/nezha-cn-base](https://huggingface.co/sijunhe/nezha-cn-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, optional, defaults to 21128): + Vocabulary size of the NEZHA model. Defines the different tokens that can be represented by the + *inputs_ids* passed to the forward method of [`NezhaModel`]. + hidden_size (`int`, optional, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, optional, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, optional, defaults to 3072): + The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the encoder and pooler. + hidden_dropout_prob (`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, optional, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, optional, defaults to 2): + The vocabulary size of the *token_type_ids* passed into [`NezhaModel`]. + initializer_range (`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + classifier_dropout (`float`, optional, defaults to 0.1): + The dropout ratio for attached classifiers. + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + + Example: + + ```python + >>> from transformers import NezhaConfig, NezhaModel + + >>> # Initializing an Nezha configuration + >>> configuration = NezhaConfig() + + >>> # Initializing a model (with random weights) from the Nezha-base style configuration model + >>> model = NezhaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "nezha" + + def __init__( + self, + vocab_size=21128, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + max_relative_position=64, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + classifier_dropout=0.1, + pad_token_id=0, + bos_token_id=2, + eos_token_id=3, + use_cache=True, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.max_relative_position = max_relative_position + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + + +__all__ = ["NezhaConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/modeling_nezha.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/modeling_nezha.py new file mode 100644 index 0000000000000000000000000000000000000000..ddfecac9f50651ad16ff53814b262435e7edcc3e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/modeling_nezha.py @@ -0,0 +1,1689 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Nezha model.""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ....activations import ACT2FN +from ....cache_utils import Cache +from ....modeling_layers import GradientCheckpointingLayer +from ....modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ....modeling_utils import PreTrainedModel +from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ....utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ....utils.deprecation import deprecate_kwarg +from .configuration_nezha import NezhaConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "sijunhe/nezha-cn-base" +_CONFIG_FOR_DOC = "NezhaConfig" + + +def load_tf_weights_in_nezha(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +class NezhaRelativePositionsEncoding(nn.Module): + """Implement the Functional Relative Position Encoding""" + + def __init__(self, length, depth, max_relative_position=127): + super().__init__() + vocab_size = max_relative_position * 2 + 1 + range_vec = torch.arange(length) + range_mat = range_vec.repeat(length).view(length, length) + distance_mat = range_mat - torch.t(range_mat) + distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position) + final_mat = distance_mat_clipped + max_relative_position + + embeddings_table = torch.zeros(vocab_size, depth) + position = torch.arange(0, vocab_size, dtype=torch.int64).float().unsqueeze(1) + div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth)) + embeddings_table[:, 0::2] = torch.sin(position * div_term) + embeddings_table[:, 1::2] = torch.cos(position * div_term) + + flat_relative_positions_matrix = final_mat.view(-1) + one_hot_relative_positions_matrix = torch.nn.functional.one_hot( + flat_relative_positions_matrix, num_classes=vocab_size + ).float() + positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table) + my_shape = list(final_mat.size()) + my_shape.append(depth) + positions_encoding = positions_encoding.view(my_shape) + self.register_buffer("positions_encoding", positions_encoding, persistent=False) + + def forward(self, length): + return self.positions_encoding[:length, :length, :] + + +class NezhaEmbeddings(nn.Module): + """Construct the embeddings from word and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.register_buffer( + "token_type_ids", torch.zeros((1, config.max_position_embeddings), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=inputs_embeds.device) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class NezhaSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.relative_positions_encoding = NezhaRelativePositionsEncoding( + length=config.max_position_embeddings, + depth=self.attention_head_size, + max_relative_position=config.max_relative_position, + ) + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_values is not None: + # reuse k,v, cross_attentions + key_layer = past_key_values[0] + value_layer = past_key_values[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_values is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_values[0], key_layer], dim=2) + value_layer = torch.cat([past_key_values[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_values` is always `None` + past_key_values = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.size() + relations_keys = self.relative_positions_encoding(to_seq_length) + query_layer_t = query_layer.permute(2, 0, 1, 3) + + query_layer_r = query_layer_t.contiguous().view( + from_seq_length, batch_size * num_attention_heads, self.attention_head_size + ) + key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1)) + key_position_scores_r = key_position_scores.view( + from_seq_length, batch_size, num_attention_heads, from_seq_length + ) + key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3) + attention_scores = attention_scores + key_position_scores_r_t + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in NezhaModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + relations_values = self.relative_positions_encoding(to_seq_length) + attention_probs_t = attention_probs.permute(2, 0, 1, 3) + attentions_probs_r = attention_probs_t.contiguous().view( + from_seq_length, batch_size * num_attention_heads, to_seq_length + ) + value_position_scores = torch.matmul(attentions_probs_r, relations_values) + value_position_scores_r = value_position_scores.view( + from_seq_length, batch_size, num_attention_heads, self.attention_head_size + ) + value_position_scores_r_t = value_position_scores_r.permute(1, 2, 0, 3) + context_layer = context_layer + value_position_scores_r_t + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_values,) + return outputs + + +class NezhaSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class NezhaAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = NezhaSelfAttention(config) + self.output = NezhaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class NezhaIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class NezhaOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class NezhaLayer(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = NezhaAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = NezhaAttention(config) + self.intermediate = NezhaIntermediate(config) + self.output = NezhaOutput(config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_values=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_values tuple + cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class NezhaEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([NezhaLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values[i] if past_key_values is not None else None, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class NezhaPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class NezhaPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class NezhaLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = NezhaPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class NezhaOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = NezhaLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class NezhaOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class NezhaPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = NezhaLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class NezhaPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: NezhaConfig + load_tf_weights = load_tf_weights_in_nezha + base_model_prefix = "nezha" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@dataclass +class NezhaForPreTrainingOutput(ModelOutput): + """ + Output type of [`NezhaForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: Optional[torch.FloatTensor] = None + seq_relationship_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +NEZHA_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`NezhaConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +NEZHA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Nezha Model transformer outputting raw hidden-states without any specific head on top.", + NEZHA_START_DOCSTRING, +) +class NezhaModel(NezhaPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = NezhaEmbeddings(config) + self.encoder = NezhaEncoder(config) + + self.pooler = NezhaPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Nezha Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. + """, + NEZHA_START_DOCSTRING, +) +class NezhaForPreTraining(NezhaPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder"] + + def __init__(self, config): + super().__init__(config) + + self.nezha = NezhaModel(config) + self.cls = NezhaPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=NezhaForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + next_sentence_label: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], NezhaForPreTrainingOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence + pair (see `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (`dict[str, any]`, optional, defaults to *{}*): + Used to hide legacy arguments that have been deprecated. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, NezhaForPreTraining + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("sijunhe/nezha-cn-base") + >>> model = NezhaForPreTraining.from_pretrained("sijunhe/nezha-cn-base") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.nezha( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return NezhaForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings("""Nezha Model with a `language modeling` head on top.""", NEZHA_START_DOCSTRING) +class NezhaForMaskedLM(NezhaPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `NezhaForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.nezha = NezhaModel(config, add_pooling_layer=False) + self.cls = NezhaOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.nezha( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError("The PAD token should be defined for generation") + + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """Nezha Model with a `next sentence prediction (classification)` head on top.""", + NEZHA_START_DOCSTRING, +) +class NezhaForNextSentencePrediction(NezhaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.nezha = NezhaModel(config) + self.cls = NezhaOnlyNSPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple[torch.Tensor], NextSentencePredictorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see `input_ids` docstring). Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, NezhaForNextSentencePrediction + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("sijunhe/nezha-cn-base") + >>> model = NezhaForNextSentencePrediction.from_pretrained("sijunhe/nezha-cn-base") + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt") + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ``` + """ + + if "next_sentence_label" in kwargs: + warnings.warn( + "The `next_sentence_label` argument is deprecated and will be removed in a future version, use" + " `labels` instead.", + FutureWarning, + ) + labels = kwargs.pop("next_sentence_label") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.nezha( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Nezha Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + NEZHA_START_DOCSTRING, +) +class NezhaForSequenceClassification(NezhaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.nezha = NezhaModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.nezha( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Nezha Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + NEZHA_START_DOCSTRING, +) +class NezhaForMultipleChoice(NezhaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.nezha = NezhaModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.nezha( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + print(pooled_output.shape) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + print(logits.shape) + print(num_choices) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Nezha Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + NEZHA_START_DOCSTRING, +) +class NezhaForTokenClassification(NezhaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.nezha = NezhaModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.nezha( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Nezha Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + NEZHA_START_DOCSTRING, +) +class NezhaForQuestionAnswering(NezhaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.nezha = NezhaModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.nezha( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "NezhaForNextSentencePrediction", + "NezhaForMaskedLM", + "NezhaForPreTraining", + "NezhaForMultipleChoice", + "NezhaForQuestionAnswering", + "NezhaForSequenceClassification", + "NezhaForTokenClassification", + "NezhaModel", + "NezhaPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2b3964d194bed041987c6236b5c60bfcf3b7caf4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2023 EleutherAI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_open_llama import * + from .modeling_open_llama import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/configuration_open_llama.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/configuration_open_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..b4bc9cc72a7661fe571cce6649b40f2307d9b3ce --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/configuration_open_llama.py @@ -0,0 +1,169 @@ +# coding=utf-8 +# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Open-Llama model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class OpenLlamaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`OpenLlamaModel`]. It is used to instantiate an + Open-Llama model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the + [s-JoL/Open-Llama-V1](https://huggingface.co/s-JoL/Open-Llama-V1). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Open-Llama model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`OpenLlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + + Example: + + ```python + >>> from transformers import OpenLlamaModel, OpenLlamaConfig + + >>> # Initializing a Open-Llama open_llama-7b style configuration + >>> configuration = OpenLlamaConfig() + + >>> # Initializing a model from the open_llama-7b style configuration + >>> model = OpenLlamaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "open-llama" + + def __init__( + self, + vocab_size=100000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + use_memory_efficient_attention=True, + hidden_dropout_prob=0.1, + attention_dropout_prob=0.1, + use_stable_embedding=True, + shared_input_output_embedding=True, + rope_theta=10000.0, + rope_scaling=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.use_memory_efficient_attention = kwargs.pop( + "use_memorry_efficient_attention", use_memory_efficient_attention + ) + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_dropout_prob = attention_dropout_prob + self.use_stable_embedding = use_stable_embedding + self.shared_input_output_embedding = shared_input_output_embedding + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + + +__all__ = ["OpenLlamaConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/modeling_open_llama.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/modeling_open_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..5e182e0f813f1c0c9ed138c39561cc23eff8cb26 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/modeling_open_llama.py @@ -0,0 +1,940 @@ +# coding=utf-8 +# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Open-Llama model.""" + +import math +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ....activations import ACT2FN +from ....cache_utils import Cache +from ....modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from ....modeling_layers import GradientCheckpointingLayer +from ....modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from ....modeling_utils import PreTrainedModel +from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ....utils.deprecation import deprecate_kwarg +from .configuration_open_llama import OpenLlamaConfig + + +logger = logging.get_logger(__name__) + +try: + from xformers import ops as xops +except ImportError: + xops = None + + +_CONFIG_FOR_DOC = "OpenLlamaConfig" + + +class OpenLlamaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + OpenLlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class OpenLlamaRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + cos_cached: torch.Tensor + sin_cached: torch.Tensor + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / ( + self.base + ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +class OpenLlamaLinearScalingRotaryEmbedding(OpenLlamaRotaryEmbedding): + """OpenLlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + t = t / self.scaling_factor + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +class OpenLlamaDynamicNTKScalingRotaryEmbedding(OpenLlamaRotaryEmbedding): + """OpenLlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / ( + base + ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class OpenLlamaMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + dropout_prob: float, + ): + super().__init__() + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.act_fn = ACT2FN[hidden_act] + self.dropout = nn.Dropout(dropout_prob) + + def forward(self, x): + out = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return self.dropout(out) + + +class OpenLlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: OpenLlamaConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = config.max_position_embeddings + self.dropout_prob = config.attention_dropout_prob + self.rope_theta = config.rope_theta + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = OpenLlamaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = OpenLlamaLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = OpenLlamaDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_values is not None: + kv_seq_len += past_key_values[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_values is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_values[0], key_states], dim=2) + value_states = torch.cat([past_key_values[1], value_states], dim=2) + + past_key_values = (key_states, value_states) if use_cache else None + + if self.config.use_memory_efficient_attention and xops is not None and self.training: + attn_weights = None + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + attn_output = xops.memory_efficient_attention( + query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask(), p=self.dropout_prob + ) + else: + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + attn_weights = torch.max( + attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device) + ) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_values + + +class OpenLlamaDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: OpenLlamaConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = OpenLlamaAttention(config=config) + self.mlp = OpenLlamaMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + dropout_prob=config.hidden_dropout_prob, + ) + self.input_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +OPEN_LLAMA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`OpenLlamaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Open-Llama Model outputting raw hidden-states without any specific head on top.", + OPEN_LLAMA_START_DOCSTRING, +) +class OpenLlamaPreTrainedModel(PreTrainedModel): + config: OpenLlamaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["OpenLlamaDecoderLayer"] + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + if self.config.use_stable_embedding: + torch.nn.init.xavier_normal_(module.weight.data) + else: + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +OPEN_LLAMA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Open-Llama Model outputting raw hidden-states without any specific head on top.", + OPEN_LLAMA_START_DOCSTRING, +) +class OpenLlamaModel(OpenLlamaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OpenLlamaDecoderLayer`] + + Args: + config: OpenLlamaConfig + """ + + def __init__(self, config: OpenLlamaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + if config.use_stable_embedding: + self.embed_layer_norm = nn.LayerNorm(config.hidden_size) + else: + self.embed_layer_norm = None + self.layers = nn.ModuleList([OpenLlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if past_key_values is not None: + past_key_values_length = past_key_values.get_seq_length() + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + if self.embed_layer_norm: + inputs_embeds = self.embed_layer_norm(inputs_embeds) + # embed positions + if self.config.use_memory_efficient_attention and self.training: + attention_mask = None + elif attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + + input_shape = (batch_size, seq_length) + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values[idx] if past_key_values is not None else None, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.model = OpenLlamaModel(config) + if config.shared_input_output_embedding: + self.lm_head = None + else: + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, OpenLlamaForCausalLM + + >>> model = OpenLlamaForCausalLM.from_pretrained("openlm-research/open_llama_7b") + >>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.shared_input_output_embedding: + logits = torch.einsum( + "blh,vh->blv", hidden_states.to(self.model.embed_tokens.weight.device), self.model.embed_tokens.weight + ) + else: + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + past_length = past_key_values.get_seq_length() + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + position_ids = kwargs.get("position_ids") + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The LLaMa Model transformer with a sequence classification head on top (linear layer). + + [`OpenLlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal + models (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + OPEN_LLAMA_START_DOCSTRING, +) +class OpenLlamaForSequenceClassification(OpenLlamaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = OpenLlamaModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = ["OpenLlamaPreTrainedModel", "OpenLlamaModel", "OpenLlamaForCausalLM", "OpenLlamaForSequenceClassification"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..864b321bc2ee3a521e3d6da5403cab7363dea56b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_qdqbert import * + from .modeling_qdqbert import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/configuration_qdqbert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/configuration_qdqbert.py new file mode 100644 index 0000000000000000000000000000000000000000..91ac82bc5a0292355ac659548c7c3d4336f2536c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/configuration_qdqbert.py @@ -0,0 +1,123 @@ +# coding=utf-8 +# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""QDQBERT model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class QDQBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`QDQBertModel`]. It is used to instantiate an + QDQBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the BERT + [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the QDQBERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`QDQBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`QDQBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + + Examples: + + ```python + >>> from transformers import QDQBertModel, QDQBertConfig + + >>> # Initializing a QDQBERT google-bert/bert-base-uncased style configuration + >>> configuration = QDQBertConfig() + + >>> # Initializing a model from the google-bert/bert-base-uncased style configuration + >>> model = QDQBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qdqbert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + use_cache=True, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + + +__all__ = ["QDQBertConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/modeling_qdqbert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/modeling_qdqbert.py new file mode 100644 index 0000000000000000000000000000000000000000..f92bc07a8bfb1c495773be09799a4f06744849f9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/modeling_qdqbert.py @@ -0,0 +1,1736 @@ +# coding=utf-8 +# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch QDQBERT model.""" + +import math +import os +import warnings +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ....activations import ACT2FN +from ....cache_utils import Cache +from ....modeling_layers import GradientCheckpointingLayer +from ....modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ....modeling_utils import PreTrainedModel +from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ....utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_pytorch_quantization_available, + logging, + replace_return_docstrings, + requires_backends, +) +from ....utils.deprecation import deprecate_kwarg +from .configuration_qdqbert import QDQBertConfig + + +logger = logging.get_logger(__name__) + +# soft dependency +if is_pytorch_quantization_available(): + try: + from pytorch_quantization import nn as quant_nn + from pytorch_quantization.nn.modules.tensor_quantizer import TensorQuantizer + except OSError: + logger.error( + "QDQBERT model are not usable since `pytorch_quantization` can't be loaded. Please try to reinstall it" + " following the instructions here:" + " https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization." + ) + +_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased" +_CONFIG_FOR_DOC = "QDQBertConfig" + + +def load_tf_weights_in_qdqbert(model, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +class QDQBertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class QDQBertSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = quant_nn.QuantLinear(config.hidden_size, self.all_head_size) + self.key = quant_nn.QuantLinear(config.hidden_size, self.all_head_size) + self.value = quant_nn.QuantLinear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + self.matmul_q_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + self.matmul_k_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + self.matmul_v_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + self.matmul_a_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_values is not None: + # reuse k,v, cross_attentions + key_layer = past_key_values[0] + value_layer = past_key_values[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_values is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_values[0], key_layer], dim=2) + value_layer = torch.cat([past_key_values[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_values` is always `None` + past_key_values = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul( + self.matmul_q_input_quantizer(query_layer), self.matmul_k_input_quantizer(key_layer.transpose(-1, -2)) + ) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in QDQBertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul( + self.matmul_a_input_quantizer(attention_probs), self.matmul_v_input_quantizer(value_layer) + ) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_values,) + return outputs + + +class QDQBertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + # Quantize Linear layer + self.dense = quant_nn.QuantLinear(config.hidden_size, config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # Quantize the inputs to the residual add + self.add_local_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + self.add_residual_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + # Quantize the inputs to the residual add + add_local = self.add_local_input_quantizer(hidden_states) + add_residual = self.add_residual_input_quantizer(input_tensor) + hidden_states = self.LayerNorm(add_local + add_residual) + return hidden_states + + +# Based on transformers.models.bert.modeling_bert.BertAttention with Bert -> QDQBert +class QDQBertAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = QDQBertSelfAttention(config) + self.output = QDQBertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class QDQBertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + # Quantize Linear layer + self.dense = quant_nn.QuantLinear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class QDQBertOutput(nn.Module): + def __init__(self, config): + super().__init__() + # Quantize Linear layer + self.dense = quant_nn.QuantLinear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # Quantize the inputs to the residual add + self.add_local_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + self.add_residual_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + # Quantize the inputs to the residual add + add_local = self.add_local_input_quantizer(hidden_states) + add_residual = self.add_residual_input_quantizer(input_tensor) + hidden_states = self.LayerNorm(add_local + add_residual) + return hidden_states + + +# Based on transformers.models.bert.modeling_bert.BertLayer with Bert -> QDQBert +class QDQBertLayer(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + self.seq_len_dim = 1 + self.attention = QDQBertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = QDQBertAttention(config) + self.intermediate = QDQBertIntermediate(config) + self.output = QDQBertOutput(config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_values=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_values tuple + cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = self.feed_forward_chunk(attention_output) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Based on transformers.models.bert.modeling_bert.BertEncoder with Bert -> QDQBert +class QDQBertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([QDQBertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values[i] if past_key_values is not None else None, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class QDQBertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class QDQBertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Based on transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert -> QDQBert +class QDQBertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = QDQBertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Based on transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert -> QDQBert +class QDQBertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = QDQBertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class QDQBertOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +# Based on transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert -> QDQBert +class QDQBertPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = QDQBertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +# Based on transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert -> QDQBert +class QDQBertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: QDQBertConfig + load_tf_weights = load_tf_weights_in_qdqbert + base_model_prefix = "bert" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +QDQBERT_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`QDQBertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +QDQBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare QDQBERT Model transformer outputting raw hidden-states without any specific head on top.", + QDQBERT_START_DOCSTRING, +) +class QDQBertModel(QDQBertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer: bool = True): + requires_backends(self, "pytorch_quantization") + super().__init__(config) + self.config = config + + self.embeddings = QDQBertEmbeddings(config) + self.encoder = QDQBertEncoder(config) + + self.pooler = QDQBertPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune: dict[int, list[int]]): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """QDQBERT Model with a `language modeling` head on top for CLM fine-tuning.""", QDQBERT_START_DOCSTRING +) +class QDQBertLMHeadModel(QDQBertPreTrainedModel): + _tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `QDQBertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.bert = QDQBertModel(config, add_pooling_layer=False) + self.cls = QDQBertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + past_key_values: Optional[tuple[tuple[torch.LongTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, CausalLMOutputWithCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, QDQBertLMHeadModel, QDQBertConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") + >>> config = QDQBertConfig.from_pretrained("google-bert/bert-base-cased") + >>> config.is_decoder = True + >>> model = QDQBertLMHeadModel.from_pretrained("google-bert/bert-base-cased", config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids: Optional[torch.LongTensor], + past_key_values=None, + attention_mask: Optional[torch.Tensor] = None, + **model_kwargs, + ): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + past_length = past_key_values.get_seq_length() + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings("""QDQBERT Model with a `language modeling` head on top.""", QDQBERT_START_DOCSTRING) +class QDQBertForMaskedLM(QDQBertPreTrainedModel): + _tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `QDQBertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.bert = QDQBertModel(config, add_pooling_layer=False) + self.cls = QDQBertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids: torch.LongTensor, attention_mask: Optional[torch.FloatTensor] = None, **model_kwargs + ): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError("The PAD token should be defined for generation") + + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top.""", + QDQBERT_START_DOCSTRING, +) +class QDQBertForNextSentencePrediction(QDQBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = QDQBertModel(config) + self.cls = QDQBertOnlyNSPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple, NextSentencePredictorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see `input_ids` docstring). Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, QDQBertForNextSentencePrediction + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + >>> model = QDQBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased") + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt") + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ```""" + + if "next_sentence_label" in kwargs: + warnings.warn( + "The `next_sentence_label` argument is deprecated and will be removed in a future version, use" + " `labels` instead.", + FutureWarning, + ) + labels = kwargs.pop("next_sentence_label") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + QDQBERT_START_DOCSTRING, +) +class QDQBertForSequenceClassification(QDQBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.bert = QDQBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + QDQBERT_START_DOCSTRING, +) +class QDQBertForMultipleChoice(QDQBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = QDQBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + QDQBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + QDQBERT_START_DOCSTRING, +) +class QDQBertForTokenClassification(QDQBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = QDQBertModel(config, add_pooling_layer=False) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + QDQBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + QDQBERT_START_DOCSTRING, +) +class QDQBertForQuestionAnswering(QDQBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = QDQBertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "QDQBertForMaskedLM", + "QDQBertForMultipleChoice", + "QDQBertForNextSentencePrediction", + "QDQBertForQuestionAnswering", + "QDQBertForSequenceClassification", + "QDQBertForTokenClassification", + "QDQBertLayer", + "QDQBertLMHeadModel", + "QDQBertModel", + "QDQBertPreTrainedModel", + "load_tf_weights_in_qdqbert", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cdfdeb5d179c8e954c9c6822d3f154d632394964 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_realm import * + from .modeling_realm import * + from .retrieval_realm import * + from .tokenization_realm import * + from .tokenization_realm_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/retrieval_realm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/retrieval_realm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57d77c3606cc4e7925f9dc353570e509688d7e2d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/retrieval_realm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/configuration_realm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/configuration_realm.py new file mode 100644 index 0000000000000000000000000000000000000000..fbf32378a604beca939ff6e0241f69f4c4382120 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/configuration_realm.py @@ -0,0 +1,169 @@ +# coding=utf-8 +# Copyright 2022 The REALM authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""REALM model configuration.""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class RealmConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of + + 1. [`RealmEmbedder`] + 2. [`RealmScorer`] + 3. [`RealmKnowledgeAugEncoder`] + 4. [`RealmRetriever`] + 5. [`RealmReader`] + 6. [`RealmForOpenQA`] + + It is used to instantiate an REALM model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the REALM + [google/realm-cc-news-pretrained-embedder](https://huggingface.co/google/realm-cc-news-pretrained-embedder) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`RealmEmbedder`], [`RealmScorer`], [`RealmKnowledgeAugEncoder`], or + [`RealmReader`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + retriever_proj_size (`int`, *optional*, defaults to 128): + Dimension of the retriever(embedder) projection. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_candidates (`int`, *optional*, defaults to 8): + Number of candidates inputted to the RealmScorer or RealmKnowledgeAugEncoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`RealmEmbedder`], [`RealmScorer`], + [`RealmKnowledgeAugEncoder`], or [`RealmReader`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + span_hidden_size (`int`, *optional*, defaults to 256): + Dimension of the reader's spans. + max_span_width (`int`, *optional*, defaults to 10): + Max span width of the reader. + reader_layer_norm_eps (`float`, *optional*, defaults to 1e-3): + The epsilon used by the reader's layer normalization layers. + reader_beam_size (`int`, *optional*, defaults to 5): + Beam size of the reader. + reader_seq_len (`int`, *optional*, defaults to 288+32): + Maximum sequence length of the reader. + num_block_records (`int`, *optional*, defaults to 13353718): + Number of block records. + searcher_beam_size (`int`, *optional*, defaults to 5000): + Beam size of the searcher. Note that when eval mode is enabled, *searcher_beam_size* will be the same as + *reader_beam_size*. + + Example: + + ```python + >>> from transformers import RealmConfig, RealmEmbedder + + >>> # Initializing a REALM realm-cc-news-pretrained-* style configuration + >>> configuration = RealmConfig() + + >>> # Initializing a model (with random weights) from the google/realm-cc-news-pretrained-embedder style configuration + >>> model = RealmEmbedder(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "realm" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + retriever_proj_size=128, + num_hidden_layers=12, + num_attention_heads=12, + num_candidates=8, + intermediate_size=3072, + hidden_act="gelu_new", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + span_hidden_size=256, + max_span_width=10, + reader_layer_norm_eps=1e-3, + reader_beam_size=5, + reader_seq_len=320, # 288 + 32 + num_block_records=13353718, + searcher_beam_size=5000, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + # Common config + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.retriever_proj_size = retriever_proj_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_candidates = num_candidates + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + + # Reader config + self.span_hidden_size = span_hidden_size + self.max_span_width = max_span_width + self.reader_layer_norm_eps = reader_layer_norm_eps + self.reader_beam_size = reader_beam_size + self.reader_seq_len = reader_seq_len + + # Retrieval config + self.num_block_records = num_block_records + self.searcher_beam_size = searcher_beam_size + + +__all__ = ["RealmConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/modeling_realm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/modeling_realm.py new file mode 100644 index 0000000000000000000000000000000000000000..9e2de5c9c1c4491390a0a31ce9c0613e9500096b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/modeling_realm.py @@ -0,0 +1,1855 @@ +# coding=utf-8 +# Copyright 2022 The REALM authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch REALM model.""" + +import math +import os +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ....activations import ACT2FN +from ....cache_utils import Cache +from ....modeling_layers import GradientCheckpointingLayer +from ....modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + MaskedLMOutput, + ModelOutput, +) +from ....modeling_utils import PreTrainedModel +from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ....utils.deprecation import deprecate_kwarg +from .configuration_realm import RealmConfig + + +logger = logging.get_logger(__name__) +_EMBEDDER_CHECKPOINT_FOR_DOC = "google/realm-cc-news-pretrained-embedder" +_ENCODER_CHECKPOINT_FOR_DOC = "google/realm-cc-news-pretrained-encoder" +_SCORER_CHECKPOINT_FOR_DOC = "google/realm-cc-news-pretrained-scorer" +_CONFIG_FOR_DOC = "RealmConfig" + + +def load_tf_weights_in_realm(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + if isinstance(model, RealmReader) and "reader" not in name: + logger.info(f"Skipping {name} as it is not {model.__class__.__name__}'s parameter") + continue + + # For pretrained openqa reader + if (name.startswith("bert") or name.startswith("cls")) and isinstance(model, RealmForOpenQA): + name = name.replace("bert/", "reader/realm/") + name = name.replace("cls/", "reader/cls/") + + # For pretrained encoder + if (name.startswith("bert") or name.startswith("cls")) and isinstance(model, RealmKnowledgeAugEncoder): + name = name.replace("bert/", "realm/") + + # For finetuned reader + if name.startswith("reader"): + reader_prefix = "" if isinstance(model, RealmReader) else "reader/" + name = name.replace("reader/module/bert/", f"{reader_prefix}realm/") + name = name.replace("reader/module/cls/", f"{reader_prefix}cls/") + name = name.replace("reader/dense/", f"{reader_prefix}qa_outputs/dense_intermediate/") + name = name.replace("reader/dense_1/", f"{reader_prefix}qa_outputs/dense_output/") + name = name.replace("reader/layer_normalization", f"{reader_prefix}qa_outputs/layer_normalization") + + # For embedder and scorer + if name.startswith("module/module/module/"): # finetuned + embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" + name = name.replace("module/module/module/module/bert/", f"{embedder_prefix}realm/") + name = name.replace("module/module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") + name = name.replace("module/module/module/dense/", f"{embedder_prefix}cls/dense/") + name = name.replace("module/module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") + name = name.replace("module/module/module/bert/", f"{embedder_prefix}realm/") + name = name.replace("module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") + elif name.startswith("module/module/"): # pretrained + embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" + name = name.replace("module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") + name = name.replace("module/module/dense/", f"{embedder_prefix}cls/dense/") + + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + assert pointer.shape == array.shape, ( + f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" + ) + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +class RealmEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class RealmSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_values is not None: + # reuse k,v, cross_attentions + key_layer = past_key_values[0] + value_layer = past_key_values[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_values is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_values[0], key_layer], dim=2) + value_layer = torch.cat([past_key_values[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_values is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_values` is always `None` + past_key_values = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in RealmModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_values,) + return outputs + + +class RealmSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +REALM_SELF_ATTENTION_CLASSES = { + "eager": RealmSelfAttention, +} + + +class RealmAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = REALM_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) + self.output = RealmSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class RealmIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class RealmOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class RealmLayer(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = RealmAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = RealmAttention(config, position_embedding_type="absolute") + self.intermediate = RealmIntermediate(config) + self.output = RealmOutput(config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_values=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_values tuple + cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class RealmEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([RealmLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values[i] if past_key_values is not None else None, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class RealmPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@dataclass +class RealmEmbedderOutput(ModelOutput): + """ + Outputs of [`RealmEmbedder`] models. + + Args: + projected_score (`torch.FloatTensor` of shape `(batch_size, config.retriever_proj_size)`): + + Projected score. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + projected_score: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +class RealmScorerOutput(ModelOutput): + """ + Outputs of [`RealmScorer`] models. + + Args: + relevance_score (`torch.FloatTensor` of shape `(batch_size, config.num_candidates)`): + The relevance score of document candidates (before softmax). + query_score (`torch.FloatTensor` of shape `(batch_size, config.retriever_proj_size)`): + Query score derived from the query embedder. + candidate_score (`torch.FloatTensor` of shape `(batch_size, config.num_candidates, config.retriever_proj_size)`): + Candidate score derived from the embedder. + """ + + relevance_score: Optional[torch.FloatTensor] = None + query_score: Optional[torch.FloatTensor] = None + candidate_score: Optional[torch.FloatTensor] = None + + +@dataclass +class RealmReaderOutput(ModelOutput): + """ + Outputs of [`RealmReader`] models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided): + Total loss. + retriever_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided): + Retriever loss. + reader_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided): + Reader loss. + retriever_correct (`torch.BoolTensor` of shape `(config.searcher_beam_size,)`, *optional*): + Whether or not an evidence block contains answer. + reader_correct (`torch.BoolTensor` of shape `(config.reader_beam_size, num_candidates)`, *optional*): + Whether or not a span candidate contains answer. + block_idx (`torch.LongTensor` of shape `()`): + The index of the retrieved evidence block in which the predicted answer is most likely. + candidate (`torch.LongTensor` of shape `()`): + The index of the retrieved span candidates in which the predicted answer is most likely. + start_pos (`torch.IntTensor` of shape `()`): + Predicted answer starting position in *RealmReader*'s inputs. + end_pos (`torch.IntTensor` of shape `()`): + Predicted answer ending position in *RealmReader*'s inputs. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + retriever_loss: Optional[torch.FloatTensor] = None + reader_loss: Optional[torch.FloatTensor] = None + retriever_correct: Optional[torch.BoolTensor] = None + reader_correct: Optional[torch.BoolTensor] = None + block_idx: Optional[torch.LongTensor] = None + candidate: Optional[torch.LongTensor] = None + start_pos: Optional[torch.IntTensor] = None + end_pos: Optional[torch.IntTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +class RealmForOpenQAOutput(ModelOutput): + """ + + Outputs of [`RealmForOpenQA`] models. + + Args: + reader_output (`dict`): + Reader output. + predicted_answer_ids (`torch.LongTensor` of shape `(answer_sequence_length)`): + Predicted answer ids. + """ + + reader_output: Optional[dict] = None + predicted_answer_ids: Optional[torch.LongTensor] = None + + +class RealmPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class RealmLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = RealmPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class RealmOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = RealmLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class RealmScorerProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = RealmLMPredictionHead(config) + self.dense = nn.Linear(config.hidden_size, config.retriever_proj_size) + self.LayerNorm = nn.LayerNorm(config.retriever_proj_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class RealmReaderProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.dense_intermediate = nn.Linear(config.hidden_size, config.span_hidden_size * 2) + self.dense_output = nn.Linear(config.span_hidden_size, 1) + self.layer_normalization = nn.LayerNorm(config.span_hidden_size, eps=config.reader_layer_norm_eps) + self.relu = nn.ReLU() + + def forward(self, hidden_states, block_mask): + def span_candidates(masks): + """ + Generate span candidates. + + Args: + masks: [num_retrievals, max_sequence_len] + + Returns: + starts: [num_spans] ends: [num_spans] span_masks: [num_retrievals, num_spans] + whether spans locate in evidence block. + """ + _, max_sequence_len = masks.shape + + def _spans_given_width(width): + current_starts = torch.arange(max_sequence_len - width + 1, device=masks.device) + current_ends = torch.arange(width - 1, max_sequence_len, device=masks.device) + return current_starts, current_ends + + starts, ends = zip(*(_spans_given_width(w + 1) for w in range(self.config.max_span_width))) + + # [num_spans] + starts = torch.cat(starts, 0) + ends = torch.cat(ends, 0) + + # [num_retrievals, num_spans] + start_masks = torch.index_select(masks, dim=-1, index=starts) + end_masks = torch.index_select(masks, dim=-1, index=ends) + span_masks = start_masks * end_masks + + return starts, ends, span_masks + + def mask_to_score(mask, dtype=torch.float32): + return (1.0 - mask.type(dtype)) * torch.finfo(dtype).min + + # [reader_beam_size, max_sequence_len, span_hidden_size * 2] + hidden_states = self.dense_intermediate(hidden_states) + # [reader_beam_size, max_sequence_len, span_hidden_size] + start_projection, end_projection = hidden_states.chunk(2, dim=-1) + + candidate_starts, candidate_ends, candidate_mask = span_candidates(block_mask) + + candidate_start_projections = torch.index_select(start_projection, dim=1, index=candidate_starts) + candidate_end_projections = torch.index_select(end_projection, dim=1, index=candidate_ends) + candidate_hidden = candidate_start_projections + candidate_end_projections + + # [reader_beam_size, num_candidates, span_hidden_size] + candidate_hidden = self.relu(candidate_hidden) + # [reader_beam_size, num_candidates, span_hidden_size] + candidate_hidden = self.layer_normalization(candidate_hidden) + # [reader_beam_size, num_candidates] + reader_logits = self.dense_output(candidate_hidden).squeeze(-1) + # [reader_beam_size, num_candidates] + reader_logits += mask_to_score(candidate_mask, dtype=reader_logits.dtype) + + return reader_logits, candidate_starts, candidate_ends + + +REALM_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`RealmConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +REALM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class RealmPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: RealmConfig + load_tf_weights = load_tf_weights_in_realm + base_model_prefix = "realm" + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _flatten_inputs(self, *inputs): + """Flatten inputs' shape to (-1, input_shape[-1])""" + flattened_inputs = [] + for tensor in inputs: + if tensor is None: + flattened_inputs.append(None) + else: + input_shape = tensor.shape + if len(input_shape) > 2: + tensor = tensor.view((-1, input_shape[-1])) + flattened_inputs.append(tensor) + return flattened_inputs + + +class RealmBertModel(RealmPreTrainedModel): + """ + Same as the original BertModel but remove docstrings. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = RealmEmbeddings(config) + self.encoder = RealmEncoder(config) + + self.pooler = RealmPooler(config) if add_pooling_layer else None + + # Weights initialization is mostly managed by other Realm models, + # but we also have them initialized here to keep a consistency. + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + "The embedder of REALM outputting projected score that will be used to calculate relevance score.", + REALM_START_DOCSTRING, +) +class RealmEmbedder(RealmPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.realm = RealmBertModel(self.config) + self.cls = RealmScorerProjection(self.config) + self.post_init() + + def get_input_embeddings(self): + return self.realm.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.realm.embeddings.word_embeddings = value + + @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=RealmEmbedderOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, RealmEmbedderOutput]: + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, RealmEmbedder + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google/realm-cc-news-pretrained-embedder") + >>> model = RealmEmbedder.from_pretrained("google/realm-cc-news-pretrained-embedder") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> projected_score = outputs.projected_score + ``` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + realm_outputs = self.realm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # [batch_size, hidden_size] + pooler_output = realm_outputs[1] + # [batch_size, retriever_proj_size] + projected_score = self.cls(pooler_output) + + if not return_dict: + return (projected_score,) + realm_outputs[2:4] + else: + return RealmEmbedderOutput( + projected_score=projected_score, + hidden_states=realm_outputs.hidden_states, + attentions=realm_outputs.attentions, + ) + + +@add_start_docstrings( + "The scorer of REALM outputting relevance scores representing the score of document candidates (before softmax).", + REALM_START_DOCSTRING, +) +class RealmScorer(RealmPreTrainedModel): + r""" + Args: + query_embedder ([`RealmEmbedder`]): + Embedder for input sequences. If not specified, it will use the same embedder as candidate sequences. + """ + + def __init__(self, config, query_embedder=None): + super().__init__(config) + + self.embedder = RealmEmbedder(self.config) + + self.query_embedder = query_embedder if query_embedder is not None else self.embedder + + self.post_init() + + @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=RealmScorerOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + candidate_input_ids: Optional[torch.LongTensor] = None, + candidate_attention_mask: Optional[torch.FloatTensor] = None, + candidate_token_type_ids: Optional[torch.LongTensor] = None, + candidate_inputs_embeds: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, RealmScorerOutput]: + r""" + candidate_input_ids (`torch.LongTensor` of shape `(batch_size, num_candidates, sequence_length)`): + Indices of candidate input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + candidate_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_candidates, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + candidate_token_type_ids (`torch.LongTensor` of shape `(batch_size, num_candidates, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + candidate_inputs_embeds (`torch.FloatTensor` of shape `(batch_size * num_candidates, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `candidate_input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert *candidate_input_ids* indices + into associated vectors than the model's internal embedding lookup matrix. + + Returns: + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, RealmScorer + + >>> tokenizer = AutoTokenizer.from_pretrained("google/realm-cc-news-pretrained-scorer") + >>> model = RealmScorer.from_pretrained("google/realm-cc-news-pretrained-scorer", num_candidates=2) + + >>> # batch_size = 2, num_candidates = 2 + >>> input_texts = ["How are you?", "What is the item in the picture?"] + >>> candidates_texts = [["Hello world!", "Nice to meet you!"], ["A cute cat.", "An adorable dog."]] + + >>> inputs = tokenizer(input_texts, return_tensors="pt") + >>> candidates_inputs = tokenizer.batch_encode_candidates(candidates_texts, max_length=10, return_tensors="pt") + + >>> outputs = model( + ... **inputs, + ... candidate_input_ids=candidates_inputs.input_ids, + ... candidate_attention_mask=candidates_inputs.attention_mask, + ... candidate_token_type_ids=candidates_inputs.token_type_ids, + ... ) + >>> relevance_score = outputs.relevance_score + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or input_embeds.") + + if candidate_input_ids is None and candidate_inputs_embeds is None: + raise ValueError("You have to specify either candidate_input_ids or candidate_inputs_embeds.") + + query_outputs = self.query_embedder( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # [batch_size * num_candidates, candidate_seq_len] + (flattened_input_ids, flattened_attention_mask, flattened_token_type_ids) = self._flatten_inputs( + candidate_input_ids, candidate_attention_mask, candidate_token_type_ids + ) + + candidate_outputs = self.embedder( + flattened_input_ids, + attention_mask=flattened_attention_mask, + token_type_ids=flattened_token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=candidate_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # [batch_size, retriever_proj_size] + query_score = query_outputs[0] + # [batch_size * num_candidates, retriever_proj_size] + candidate_score = candidate_outputs[0] + # [batch_size, num_candidates, retriever_proj_size] + candidate_score = candidate_score.view(-1, self.config.num_candidates, self.config.retriever_proj_size) + # [batch_size, num_candidates] + relevance_score = torch.einsum("bd,bnd->bn", query_score, candidate_score) + + if not return_dict: + return relevance_score, query_score, candidate_score + + return RealmScorerOutput( + relevance_score=relevance_score, query_score=query_score, candidate_score=candidate_score + ) + + +@add_start_docstrings( + "The knowledge-augmented encoder of REALM outputting masked language model logits and marginal log-likelihood" + " loss.", + REALM_START_DOCSTRING, +) +class RealmKnowledgeAugEncoder(RealmPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder"] + + def __init__(self, config): + super().__init__(config) + self.realm = RealmBertModel(self.config) + self.cls = RealmOnlyMLMHead(self.config) + self.post_init() + + def get_input_embeddings(self): + return self.realm.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.realm.embeddings.word_embeddings = value + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward( + REALM_INPUTS_DOCSTRING.format("batch_size, num_candidates, sequence_length") + ) + @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + relevance_score: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + mlm_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MaskedLMOutput]: + r""" + relevance_score (`torch.FloatTensor` of shape `(batch_size, num_candidates)`, *optional*): + Relevance score derived from RealmScorer, must be specified if you want to compute the masked language + modeling loss. + + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + + mlm_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid calculating joint loss on certain positions. If not specified, the loss will not be masked. + Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + Returns: + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, RealmKnowledgeAugEncoder + + >>> tokenizer = AutoTokenizer.from_pretrained("google/realm-cc-news-pretrained-encoder") + >>> model = RealmKnowledgeAugEncoder.from_pretrained( + ... "google/realm-cc-news-pretrained-encoder", num_candidates=2 + ... ) + + >>> # batch_size = 2, num_candidates = 2 + >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]] + + >>> inputs = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt") + >>> outputs = model(**inputs) + >>> logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None and relevance_score is None: + raise ValueError( + "You have to specify `relevance_score` when `labels` is specified in order to compute loss." + ) + + (flattened_input_ids, flattened_attention_mask, flattened_token_type_ids) = self._flatten_inputs( + input_ids, attention_mask, token_type_ids + ) + + joint_outputs = self.realm( + flattened_input_ids, + attention_mask=flattened_attention_mask, + token_type_ids=flattened_token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # [batch_size * num_candidates, joint_seq_len, hidden_size] + joint_output = joint_outputs[0] + # [batch_size * num_candidates, joint_seq_len, vocab_size] + prediction_scores = self.cls(joint_output) + # [batch_size, num_candidates] + candidate_score = relevance_score + + masked_lm_loss = None + if labels is not None: + batch_size, seq_length = labels.size() + + if mlm_mask is None: + mlm_mask = torch.ones_like(labels, dtype=torch.float32) + else: + mlm_mask = mlm_mask.type(torch.float32) + + # Compute marginal log-likelihood + loss_fct = CrossEntropyLoss(reduction="none") # -100 index = padding token + + # [batch_size * num_candidates * joint_seq_len, vocab_size] + mlm_logits = prediction_scores.view(-1, self.config.vocab_size) + # [batch_size * num_candidates * joint_seq_len] + mlm_targets = labels.tile(1, self.config.num_candidates).view(-1) + # [batch_size, num_candidates, joint_seq_len] + masked_lm_log_prob = -loss_fct(mlm_logits, mlm_targets).view( + batch_size, self.config.num_candidates, seq_length + ) + # [batch_size, num_candidates, 1] + candidate_log_prob = candidate_score.log_softmax(-1).unsqueeze(-1) + # [batch_size, num_candidates, joint_seq_len] + joint_gold_log_prob = candidate_log_prob + masked_lm_log_prob + # [batch_size, joint_seq_len] + marginal_gold_log_probs = joint_gold_log_prob.logsumexp(1) + # [] + masked_lm_loss = -torch.nansum(torch.sum(marginal_gold_log_probs * mlm_mask) / torch.sum(mlm_mask)) + + if not return_dict: + output = (prediction_scores,) + joint_outputs[2:4] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=joint_outputs.hidden_states, + attentions=joint_outputs.attentions, + ) + + +@add_start_docstrings("The reader of REALM.", REALM_START_DOCSTRING) +class RealmReader(RealmPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.realm = RealmBertModel(config) + self.cls = RealmOnlyMLMHead(config) + self.qa_outputs = RealmReaderProjection(config) + + self.post_init() + + @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("reader_beam_size, sequence_length")) + @replace_return_docstrings(output_type=RealmReaderOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + relevance_score: Optional[torch.FloatTensor] = None, + block_mask: Optional[torch.BoolTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + has_answers: Optional[torch.BoolTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, RealmReaderOutput]: + r""" + relevance_score (`torch.FloatTensor` of shape `(searcher_beam_size,)`, *optional*): + Relevance score, which must be specified if you want to compute the logits and marginal log loss. + block_mask (`torch.BoolTensor` of shape `(searcher_beam_size, sequence_length)`, *optional*): + The mask of the evidence block, which must be specified if you want to compute the logits and marginal log + loss. + start_positions (`torch.LongTensor` of shape `(searcher_beam_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(searcher_beam_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + has_answers (`torch.BoolTensor` of shape `(searcher_beam_size,)`, *optional*): + Whether or not the evidence block has answer(s). + + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if relevance_score is None: + raise ValueError("You have to specify `relevance_score` to calculate logits and loss.") + if block_mask is None: + raise ValueError("You have to specify `block_mask` to separate question block and evidence block.") + if token_type_ids.size(1) < self.config.max_span_width: + raise ValueError("The input sequence length must be greater than or equal to config.max_span_width.") + outputs = self.realm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # [reader_beam_size, joint_seq_len, hidden_size] + sequence_output = outputs[0] + + # [reader_beam_size, num_candidates], [num_candidates], [num_candidates] + reader_logits, candidate_starts, candidate_ends = self.qa_outputs( + sequence_output, block_mask[0 : self.config.reader_beam_size] + ) + # [searcher_beam_size, 1] + retriever_logits = torch.unsqueeze(relevance_score[0 : self.config.reader_beam_size], -1) + # [reader_beam_size, num_candidates] + reader_logits += retriever_logits + # [] + predicted_block_index = torch.argmax(torch.max(reader_logits, dim=1).values) + # [] + predicted_candidate = torch.argmax(torch.max(reader_logits, dim=0).values) + # [1] + predicted_start = torch.index_select(candidate_starts, dim=0, index=predicted_candidate) + # [1] + predicted_end = torch.index_select(candidate_ends, dim=0, index=predicted_candidate) + + total_loss = None + retriever_loss = None + reader_loss = None + retriever_correct = None + reader_correct = None + if start_positions is not None and end_positions is not None and has_answers is not None: + + def compute_correct_candidates(candidate_starts, candidate_ends, gold_starts, gold_ends): + """Compute correct span.""" + # [reader_beam_size, num_answers, num_candidates] + is_gold_start = torch.eq( + torch.unsqueeze(torch.unsqueeze(candidate_starts, 0), 0), torch.unsqueeze(gold_starts, -1) + ) + is_gold_end = torch.eq( + torch.unsqueeze(torch.unsqueeze(candidate_ends, 0), 0), torch.unsqueeze(gold_ends, -1) + ) + + # [reader_beam_size, num_candidates] + return torch.any(torch.logical_and(is_gold_start, is_gold_end), 1) + + def marginal_log_loss(logits, is_correct): + """Loss based on the negative marginal log-likelihood.""" + + def mask_to_score(mask, dtype=torch.float32): + return (1.0 - mask.type(dtype)) * torch.finfo(dtype).min + + # [] + log_numerator = torch.logsumexp(logits + mask_to_score(is_correct, dtype=logits.dtype), dim=-1) + log_denominator = torch.logsumexp(logits, dim=-1) + return log_denominator - log_numerator + + # sometimes the start/end positions are outside our model inputs, we ignore these terms + # `-1` is reserved for no answer. + ignored_index = sequence_output.size(1) + start_positions = start_positions.clamp(-1, ignored_index) + end_positions = end_positions.clamp(-1, ignored_index) + + retriever_correct = has_answers + any_retriever_correct = torch.any(retriever_correct) + + reader_correct = compute_correct_candidates( + candidate_starts=candidate_starts, + candidate_ends=candidate_ends, + gold_starts=start_positions[0 : self.config.reader_beam_size], + gold_ends=end_positions[0 : self.config.reader_beam_size], + ) + any_reader_correct = torch.any(reader_correct) + + retriever_loss = marginal_log_loss(relevance_score, retriever_correct) + reader_loss = marginal_log_loss(reader_logits.view(-1), reader_correct.view(-1)) + retriever_loss *= any_retriever_correct.type(torch.float32) + reader_loss *= any_reader_correct.type(torch.float32) + + total_loss = (retriever_loss + reader_loss).mean() + + if not return_dict: + output = (predicted_block_index, predicted_candidate, predicted_start, predicted_end) + outputs[2:] + return ( + ((total_loss, retriever_loss, reader_loss, retriever_correct, reader_correct) + output) + if total_loss is not None + else output + ) + + return RealmReaderOutput( + loss=total_loss, + retriever_loss=retriever_loss, + reader_loss=reader_loss, + retriever_correct=retriever_correct, + reader_correct=reader_correct, + block_idx=predicted_block_index, + candidate=predicted_candidate, + start_pos=predicted_start, + end_pos=predicted_end, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +REALM_FOR_OPEN_QA_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token (should not be used in this model by design). + + [What are token type IDs?](../glossary#token-type-ids) + answer_ids (`list` of shape `(num_answers, answer_length)`, *optional*): + Answer ids for computing the marginal log-likelihood loss. Indices should be in `[-1, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-1` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "`RealmForOpenQA` for end-to-end open domain question answering.", + REALM_START_DOCSTRING, +) +class RealmForOpenQA(RealmPreTrainedModel): + def __init__(self, config, retriever=None): + super().__init__(config) + self.embedder = RealmEmbedder(config) + self.reader = RealmReader(config) + self.register_buffer( + "block_emb", + torch.zeros(()).new_empty( + size=(config.num_block_records, config.retriever_proj_size), + dtype=torch.float32, + device=torch.device("cpu"), + ), + ) + self.retriever = retriever + + self.post_init() + + @property + def searcher_beam_size(self): + if self.training: + return self.config.searcher_beam_size + return self.config.reader_beam_size + + def block_embedding_to(self, device): + """Send `self.block_emb` to a specific device. + + Args: + device (`str` or `torch.device`): + The device to which `self.block_emb` will be sent. + """ + + self.block_emb = self.block_emb.to(device) + + @add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING.format("1, sequence_length")) + @replace_return_docstrings(output_type=RealmForOpenQAOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor], + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + answer_ids: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, RealmForOpenQAOutput]: + r""" + Returns: + + Example: + + ```python + >>> import torch + >>> from transformers import RealmForOpenQA, RealmRetriever, AutoTokenizer + + >>> retriever = RealmRetriever.from_pretrained("google/realm-orqa-nq-openqa") + >>> tokenizer = AutoTokenizer.from_pretrained("google/realm-orqa-nq-openqa") + >>> model = RealmForOpenQA.from_pretrained("google/realm-orqa-nq-openqa", retriever=retriever) + + >>> question = "Who is the pioneer in modern computer science?" + >>> question_ids = tokenizer([question], return_tensors="pt") + >>> answer_ids = tokenizer( + ... ["alan mathison turing"], + ... add_special_tokens=False, + ... return_token_type_ids=False, + ... return_attention_mask=False, + ... ).input_ids + + >>> reader_output, predicted_answer_ids = model(**question_ids, answer_ids=answer_ids, return_dict=False) + >>> predicted_answer = tokenizer.decode(predicted_answer_ids) + >>> loss = reader_output.loss + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and input_ids.shape[0] != 1: + raise ValueError("The batch_size of the inputs must be 1.") + + question_outputs = self.embedder( + input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True + ) + # [1, projection_size] + question_projection = question_outputs[0] + + # CPU computation starts. + # [1, block_emb_size] + batch_scores = torch.einsum("BD,QD->QB", self.block_emb, question_projection.to(self.block_emb.device)) + # [1, searcher_beam_size] + _, retrieved_block_ids = torch.topk(batch_scores, k=self.searcher_beam_size, dim=-1) + # [searcher_beam_size] + retrieved_block_ids = retrieved_block_ids.squeeze() + # [searcher_beam_size, projection_size] + retrieved_block_emb = torch.index_select(self.block_emb, dim=0, index=retrieved_block_ids) + # CPU computation ends. + + # Retrieve possible answers + has_answers, start_pos, end_pos, concat_inputs = self.retriever( + retrieved_block_ids.cpu(), input_ids, answer_ids, max_length=self.config.reader_seq_len + ) + + concat_inputs = concat_inputs.to(self.reader.device) + block_mask = concat_inputs.special_tokens_mask.type(torch.bool).to(device=self.reader.device) + block_mask.logical_not_().logical_and_(concat_inputs.token_type_ids.type(torch.bool)) + + if has_answers is not None: + has_answers = torch.tensor(has_answers, dtype=torch.bool, device=self.reader.device) + start_pos = torch.tensor(start_pos, dtype=torch.long, device=self.reader.device) + end_pos = torch.tensor(end_pos, dtype=torch.long, device=self.reader.device) + + # [searcher_beam_size] + retrieved_logits = torch.einsum( + "D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(self.reader.device) + ) + + reader_output = self.reader( + input_ids=concat_inputs.input_ids[0 : self.config.reader_beam_size], + attention_mask=concat_inputs.attention_mask[0 : self.config.reader_beam_size], + token_type_ids=concat_inputs.token_type_ids[0 : self.config.reader_beam_size], + relevance_score=retrieved_logits, + block_mask=block_mask, + has_answers=has_answers, + start_positions=start_pos, + end_positions=end_pos, + return_dict=True, + ) + + predicted_block = concat_inputs.input_ids[reader_output.block_idx] + predicted_answer_ids = predicted_block[reader_output.start_pos : reader_output.end_pos + 1] + + if not return_dict: + return reader_output, predicted_answer_ids + + return RealmForOpenQAOutput( + reader_output=reader_output, + predicted_answer_ids=predicted_answer_ids, + ) + + +__all__ = [ + "RealmEmbedder", + "RealmForOpenQA", + "RealmKnowledgeAugEncoder", + "RealmPreTrainedModel", + "RealmReader", + "RealmScorer", + "load_tf_weights_in_realm", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/retrieval_realm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/retrieval_realm.py new file mode 100644 index 0000000000000000000000000000000000000000..b5e47abb11791aedccd5621677713136a037be8c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/retrieval_realm.py @@ -0,0 +1,176 @@ +# coding=utf-8 +# Copyright 2022 The REALM authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""REALM Retriever model implementation.""" + +import os +from typing import Optional, Union + +import numpy as np +from huggingface_hub import hf_hub_download + +from transformers import AutoTokenizer + +from ....utils import logging, strtobool + + +_REALM_BLOCK_RECORDS_FILENAME = "block_records.npy" + + +logger = logging.get_logger(__name__) + + +def convert_tfrecord_to_np(block_records_path: str, num_block_records: int) -> np.ndarray: + import tensorflow.compat.v1 as tf + + blocks_dataset = tf.data.TFRecordDataset(block_records_path, buffer_size=512 * 1024 * 1024) + blocks_dataset = blocks_dataset.batch(num_block_records, drop_remainder=True) + np_record = next(blocks_dataset.take(1).as_numpy_iterator()) + + return np_record + + +class ScaNNSearcher: + """Note that ScaNNSearcher cannot currently be used within the model. In future versions, it might however be included.""" + + def __init__( + self, + db, + num_neighbors, + dimensions_per_block=2, + num_leaves=1000, + num_leaves_to_search=100, + training_sample_size=100000, + ): + """Build scann searcher.""" + + from scann.scann_ops.py.scann_ops_pybind import builder as Builder + + builder = Builder(db=db, num_neighbors=num_neighbors, distance_measure="dot_product") + builder = builder.tree( + num_leaves=num_leaves, num_leaves_to_search=num_leaves_to_search, training_sample_size=training_sample_size + ) + builder = builder.score_ah(dimensions_per_block=dimensions_per_block) + + self.searcher = builder.build() + + def search_batched(self, question_projection): + retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) + return retrieved_block_ids.astype("int64") + + +class RealmRetriever: + """The retriever of REALM outputting the retrieved evidence block and whether the block has answers as well as answer + positions." + + Parameters: + block_records (`np.ndarray`): + A numpy array which contains evidence texts. + tokenizer ([`RealmTokenizer`]): + The tokenizer to encode retrieved texts. + """ + + def __init__(self, block_records, tokenizer): + super().__init__() + self.block_records = block_records + self.tokenizer = tokenizer + + def __call__(self, retrieved_block_ids, question_input_ids, answer_ids, max_length=None, return_tensors="pt"): + retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) + + question = self.tokenizer.decode(question_input_ids[0], skip_special_tokens=True) + + text = [] + text_pair = [] + for retrieved_block in retrieved_blocks: + text.append(question) + text_pair.append(retrieved_block.decode()) + + concat_inputs = self.tokenizer( + text, text_pair, padding=True, truncation=True, return_special_tokens_mask=True, max_length=max_length + ) + concat_inputs_tensors = concat_inputs.convert_to_tensors(return_tensors) + + if answer_ids is not None: + return self.block_has_answer(concat_inputs, answer_ids) + (concat_inputs_tensors,) + else: + return (None, None, None, concat_inputs_tensors) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *init_inputs, **kwargs): + if os.path.isdir(pretrained_model_name_or_path): + block_records_path = os.path.join(pretrained_model_name_or_path, _REALM_BLOCK_RECORDS_FILENAME) + else: + block_records_path = hf_hub_download( + repo_id=pretrained_model_name_or_path, filename=_REALM_BLOCK_RECORDS_FILENAME, **kwargs + ) + if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")): + raise ValueError( + "This part uses `pickle.load` which is insecure and will execute arbitrary code that is " + "potentially malicious. It's recommended to never unpickle data that could have come from an " + "untrusted source, or that could have been tampered with. If you already verified the pickle " + "data and decided to use it, you can set the environment variable " + "`TRUST_REMOTE_CODE` to `True` to allow it." + ) + block_records = np.load(block_records_path, allow_pickle=True) + + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) + + return cls(block_records, tokenizer) + + def save_pretrained(self, save_directory): + # save block records + np.save(os.path.join(save_directory, _REALM_BLOCK_RECORDS_FILENAME), self.block_records) + # save tokenizer + self.tokenizer.save_pretrained(save_directory) + + def block_has_answer(self, concat_inputs, answer_ids): + """check if retrieved_blocks has answers.""" + has_answers = [] + start_pos = [] + end_pos = [] + max_answers = 0 + + for input_id in concat_inputs.input_ids: + input_id_list = input_id.tolist() + # Check answers between two [SEP] tokens + first_sep_idx = input_id_list.index(self.tokenizer.sep_token_id) + second_sep_idx = first_sep_idx + 1 + input_id_list[first_sep_idx + 1 :].index(self.tokenizer.sep_token_id) + + start_pos.append([]) + end_pos.append([]) + for answer in answer_ids: + for idx in range(first_sep_idx + 1, second_sep_idx): + if answer[0] == input_id_list[idx]: + if input_id_list[idx : idx + len(answer)] == answer: + start_pos[-1].append(idx) + end_pos[-1].append(idx + len(answer) - 1) + + if len(start_pos[-1]) == 0: + has_answers.append(False) + else: + has_answers.append(True) + if len(start_pos[-1]) > max_answers: + max_answers = len(start_pos[-1]) + + # Pad -1 to max_answers + for start_pos_, end_pos_ in zip(start_pos, end_pos): + if len(start_pos_) < max_answers: + padded = [-1] * (max_answers - len(start_pos_)) + start_pos_ += padded + end_pos_ += padded + return has_answers, start_pos, end_pos + + +__all__ = ["RealmRetriever"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/tokenization_realm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/tokenization_realm.py new file mode 100644 index 0000000000000000000000000000000000000000..af7cdc1a5bac3afea8ed0651ae2892adcc03cbc3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/tokenization_realm.py @@ -0,0 +1,534 @@ +# coding=utf-8 +# Copyright 2022 The REALM authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for REALM.""" + +import collections +import os +import unicodedata +from typing import Optional + +from ....tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ....tokenization_utils_base import BatchEncoding +from ....utils import PaddingStrategy, logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class RealmTokenizer(PreTrainedTokenizer): + r""" + Construct a REALM tokenizer. + + [`RealmTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting and + wordpiece. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs, + ): + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = RealmTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def batch_encode_candidates(self, text, **kwargs): + r""" + Encode a batch of text or text pair. This method is similar to regular __call__ method but has the following + differences: + + 1. Handle additional num_candidate axis. (batch_size, num_candidates, text) + 2. Always pad the sequences to *max_length*. + 3. Must specify *max_length* in order to stack packs of candidates into a batch. + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + text (`List[List[str]]`): + The batch of sequences to be encoded. Each sequence must be in this format: (batch_size, + num_candidates, text). + text_pair (`List[List[str]]`, *optional*): + The batch of sequences to be encoded. Each sequence must be in this format: (batch_size, + num_candidates, text). + **kwargs: + Keyword arguments of the __call__ method. + + Returns: + [`BatchEncoding`]: Encoded text or text pair. + + Example: + + ```python + >>> from transformers import RealmTokenizer + + >>> # batch_size = 2, num_candidates = 2 + >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]] + + >>> tokenizer = RealmTokenizer.from_pretrained("google/realm-cc-news-pretrained-encoder") + >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt") + ```""" + + # Always using a fixed sequence length to encode in order to stack candidates into a batch. + kwargs["padding"] = PaddingStrategy.MAX_LENGTH + + batch_text = text + batch_text_pair = kwargs.pop("text_pair", None) + return_tensors = kwargs.pop("return_tensors", None) + + output_data = { + "input_ids": [], + "attention_mask": [], + "token_type_ids": [], + } + + for idx, candidate_text in enumerate(batch_text): + if batch_text_pair is not None: + candidate_text_pair = batch_text_pair[idx] + else: + candidate_text_pair = None + + encoded_candidates = super().__call__(candidate_text, candidate_text_pair, return_tensors=None, **kwargs) + + encoded_input_ids = encoded_candidates.get("input_ids") + encoded_attention_mask = encoded_candidates.get("attention_mask") + encoded_token_type_ids = encoded_candidates.get("token_type_ids") + + if encoded_input_ids is not None: + output_data["input_ids"].append(encoded_input_ids) + if encoded_attention_mask is not None: + output_data["attention_mask"].append(encoded_attention_mask) + if encoded_token_type_ids is not None: + output_data["token_type_ids"].append(encoded_token_type_ids) + + output_data = {key: item for key, item in output_data.items() if len(item) != 0} + + return BatchEncoding(output_data, tensor_type=return_tensors) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A REALM sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class BasicTokenizer: + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) + ): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer: + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +__all__ = ["RealmTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/tokenization_realm_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/tokenization_realm_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..53a1d99e0ac804ae196e9c65f0e2979dcb2401c8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/tokenization_realm_fast.py @@ -0,0 +1,223 @@ +# coding=utf-8 +# Copyright 2022 The REALM authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Tokenization classes for REALM.""" + +import json +from typing import Optional + +from tokenizers import normalizers + +from ....tokenization_utils_base import BatchEncoding +from ....tokenization_utils_fast import PreTrainedTokenizerFast +from ....utils import PaddingStrategy, logging +from .tokenization_realm import RealmTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + + +class RealmTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" REALM tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece. + + [`RealmTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization: punctuation + splitting and wordpiece. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + clean_text (`bool`, *optional*, defaults to `True`): + Whether or not to clean the text before tokenization by removing any control characters and replacing all + whitespaces by the classic one. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this + issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + wordpieces_prefix (`str`, *optional*, defaults to `"##"`): + The prefix for subwords. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = RealmTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs, + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__()) + if ( + normalizer_state.get("lowercase", do_lower_case) != do_lower_case + or normalizer_state.get("strip_accents", strip_accents) != strip_accents + or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars + ): + normalizer_class = getattr(normalizers, normalizer_state.pop("type")) + normalizer_state["lowercase"] = do_lower_case + normalizer_state["strip_accents"] = strip_accents + normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars + self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state) + + self.do_lower_case = do_lower_case + + def batch_encode_candidates(self, text, **kwargs): + r""" + Encode a batch of text or text pair. This method is similar to regular __call__ method but has the following + differences: + + 1. Handle additional num_candidate axis. (batch_size, num_candidates, text) + 2. Always pad the sequences to *max_length*. + 3. Must specify *max_length* in order to stack packs of candidates into a batch. + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + text (`List[List[str]]`): + The batch of sequences to be encoded. Each sequence must be in this format: (batch_size, + num_candidates, text). + text_pair (`List[List[str]]`, *optional*): + The batch of sequences to be encoded. Each sequence must be in this format: (batch_size, + num_candidates, text). + **kwargs: + Keyword arguments of the __call__ method. + + Returns: + [`BatchEncoding`]: Encoded text or text pair. + + Example: + + ```python + >>> from transformers import RealmTokenizerFast + + >>> # batch_size = 2, num_candidates = 2 + >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]] + + >>> tokenizer = RealmTokenizerFast.from_pretrained("google/realm-cc-news-pretrained-encoder") + >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt") + ```""" + + # Always using a fixed sequence length to encode in order to stack candidates into a batch. + kwargs["padding"] = PaddingStrategy.MAX_LENGTH + + batch_text = text + batch_text_pair = kwargs.pop("text_pair", None) + return_tensors = kwargs.pop("return_tensors", None) + + output_data = { + "input_ids": [], + "attention_mask": [], + "token_type_ids": [], + } + + for idx, candidate_text in enumerate(batch_text): + if batch_text_pair is not None: + candidate_text_pair = batch_text_pair[idx] + else: + candidate_text_pair = None + + encoded_candidates = super().__call__(candidate_text, candidate_text_pair, return_tensors=None, **kwargs) + + encoded_input_ids = encoded_candidates.get("input_ids") + encoded_attention_mask = encoded_candidates.get("attention_mask") + encoded_token_type_ids = encoded_candidates.get("token_type_ids") + + if encoded_input_ids is not None: + output_data["input_ids"].append(encoded_input_ids) + if encoded_attention_mask is not None: + output_data["attention_mask"].append(encoded_attention_mask) + if encoded_token_type_ids is not None: + output_data["token_type_ids"].append(encoded_token_type_ids) + + output_data = {key: item for key, item in output_data.items() if len(item) != 0} + + return BatchEncoding(output_data, tensor_type=return_tensors) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A REALM sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + + if token_ids_1 is not None: + output += token_ids_1 + [self.sep_token_id] + + return output + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["RealmTokenizerFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a875576607430c041abab01eccaf468a6cc9272e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_retribert import * + from .modeling_retribert import * + from .tokenization_retribert import * + from .tokenization_retribert_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/configuration_retribert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/configuration_retribert.py new file mode 100644 index 0000000000000000000000000000000000000000..80d755a16961450cae783d834385e5e6873dc24e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/configuration_retribert.py @@ -0,0 +1,108 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""RetriBERT model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class RetriBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RetriBertModel`]. It is used to instantiate a + RetriBertModel model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the RetriBERT + [yjernite/retribert-base-uncased](https://huggingface.co/yjernite/retribert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the RetriBERT model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`RetriBertModel`] + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the *token_type_ids* passed into [`BertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + share_encoders (`bool`, *optional*, defaults to `True`): + Whether or not to use the same Bert-type encoder for the queries and document + projection_dim (`int`, *optional*, defaults to 128): + Final dimension of the query and document representation after projection + """ + + model_type = "retribert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=8, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + share_encoders=True, + projection_dim=128, + pad_token_id=0, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.share_encoders = share_encoders + self.projection_dim = projection_dim + + +__all__ = ["RetriBertConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/modeling_retribert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/modeling_retribert.py new file mode 100644 index 0000000000000000000000000000000000000000..06806e8e6d0bb01fa31c6849ccb23f3e29bf74aa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/modeling_retribert.py @@ -0,0 +1,217 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +RetriBERT model +""" + +import math +from typing import Optional + +import torch +import torch.utils.checkpoint as checkpoint +from torch import nn + +from ....modeling_utils import PreTrainedModel +from ....utils import add_start_docstrings, logging +from ...bert.modeling_bert import BertModel +from .configuration_retribert import RetriBertConfig + + +logger = logging.get_logger(__name__) + + +# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # +class RetriBertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: RetriBertConfig + load_tf_weights = None + base_model_prefix = "retribert" + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +RETRIBERT_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`RetriBertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + """Bert Based model to embed queries or document for document retrieval.""", + RETRIBERT_START_DOCSTRING, +) +class RetriBertModel(RetriBertPreTrainedModel): + def __init__(self, config: RetriBertConfig) -> None: + super().__init__(config) + self.projection_dim = config.projection_dim + + self.bert_query = BertModel(config) + self.bert_doc = None if config.share_encoders else BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.project_query = nn.Linear(config.hidden_size, config.projection_dim, bias=False) + self.project_doc = nn.Linear(config.hidden_size, config.projection_dim, bias=False) + + self.ce_loss = nn.CrossEntropyLoss(reduction="mean") + + # Initialize weights and apply final processing + self.post_init() + + def embed_sentences_checkpointed( + self, + input_ids, + attention_mask, + sent_encoder, + checkpoint_batch_size=-1, + ): + # reproduces BERT forward pass with checkpointing + if checkpoint_batch_size < 0 or input_ids.shape[0] < checkpoint_batch_size: + return sent_encoder(input_ids, attention_mask=attention_mask)[1] + else: + # prepare implicit variables + device = input_ids.device + input_shape = input_ids.size() + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + head_mask = [None] * sent_encoder.config.num_hidden_layers + extended_attention_mask: torch.Tensor = sent_encoder.get_extended_attention_mask( + attention_mask, input_shape + ) + + # define function for checkpointing + def partial_encode(*inputs): + encoder_outputs = sent_encoder.encoder( + inputs[0], + attention_mask=inputs[1], + head_mask=head_mask, + ) + sequence_output = encoder_outputs[0] + pooled_output = sent_encoder.pooler(sequence_output) + return pooled_output + + # run embedding layer on everything at once + embedding_output = sent_encoder.embeddings( + input_ids=input_ids, position_ids=None, token_type_ids=token_type_ids, inputs_embeds=None + ) + # run encoding and pooling on one mini-batch at a time + pooled_output_list = [] + for b in range(math.ceil(input_ids.shape[0] / checkpoint_batch_size)): + b_embedding_output = embedding_output[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size] + b_attention_mask = extended_attention_mask[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size] + pooled_output = checkpoint.checkpoint(partial_encode, b_embedding_output, b_attention_mask) + pooled_output_list.append(pooled_output) + return torch.cat(pooled_output_list, dim=0) + + def embed_questions( + self, + input_ids, + attention_mask=None, + checkpoint_batch_size=-1, + ): + q_reps = self.embed_sentences_checkpointed( + input_ids, + attention_mask, + self.bert_query, + checkpoint_batch_size, + ) + return self.project_query(q_reps) + + def embed_answers( + self, + input_ids, + attention_mask=None, + checkpoint_batch_size=-1, + ): + a_reps = self.embed_sentences_checkpointed( + input_ids, + attention_mask, + self.bert_query if self.bert_doc is None else self.bert_doc, + checkpoint_batch_size, + ) + return self.project_doc(a_reps) + + def forward( + self, + input_ids_query: torch.LongTensor, + attention_mask_query: Optional[torch.FloatTensor], + input_ids_doc: torch.LongTensor, + attention_mask_doc: Optional[torch.FloatTensor], + checkpoint_batch_size: int = -1, + ) -> torch.FloatTensor: + r""" + Args: + input_ids_query (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary for the queries in a batch. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask_query (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + input_ids_doc (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary for the documents in a batch. + attention_mask_doc (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on documents padding token indices. + checkpoint_batch_size (`int`, *optional*, defaults to `-1`): + If greater than 0, uses gradient checkpointing to only compute sequence representation on + `checkpoint_batch_size` examples at a time on the GPU. All query representations are still compared to + all document representations in the batch. + + Return: + `torch.FloatTensor``: The bidirectional cross-entropy loss obtained while trying to match each query to its + corresponding document and each document to its corresponding query in the batch + """ + device = input_ids_query.device + q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size) + a_reps = self.embed_answers(input_ids_doc, attention_mask_doc, checkpoint_batch_size) + compare_scores = torch.mm(q_reps, a_reps.t()) + loss_qa = self.ce_loss(compare_scores, torch.arange(compare_scores.shape[1]).to(device)) + loss_aq = self.ce_loss(compare_scores.t(), torch.arange(compare_scores.shape[0]).to(device)) + loss = (loss_qa + loss_aq) / 2 + return loss + + +__all__ = ["RetriBertModel", "RetriBertPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/tokenization_retribert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/tokenization_retribert.py new file mode 100644 index 0000000000000000000000000000000000000000..288b46e267bccb358fa86e74ac2b98dcb99e8904 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/tokenization_retribert.py @@ -0,0 +1,475 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for RetriBERT.""" + +import collections +import os +import unicodedata +from typing import Optional + +from ....tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ....utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class RetriBertTokenizer(PreTrainedTokenizer): + r""" + Constructs a RetriBERT tokenizer. + + [`RetriBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting + and wordpiece. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer + to: this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs, + ): + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text, split_special_tokens=False): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens if not split_special_tokens else None + ): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class BasicTokenizer: + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. + """ + + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if not self.do_split_on_punc or (never_split is not None and text in never_split): + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) + ): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer: + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +__all__ = ["RetriBertTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/tokenization_retribert_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/tokenization_retribert_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..5fcbc395a1b7ba7f659a9befae14c8ba658c27d0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/tokenization_retribert_fast.py @@ -0,0 +1,150 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for RetriBERT.""" + +import json +from typing import Optional + +from tokenizers import normalizers + +from ....tokenization_utils_fast import PreTrainedTokenizerFast +from ....utils import logging +from .tokenization_retribert import RetriBertTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + + +class RetriBertTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's *tokenizers* library). + + [`RetriBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization: punctuation + splitting and wordpiece. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + clean_text (`bool`, *optional*, defaults to `True`): + Whether or not to clean the text before tokenization by removing any control characters and replacing all + whitespaces by the classic one. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this + issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + wordpieces_prefix (`str`, *optional*, defaults to `"##"`): + The prefix for subwords. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = RetriBertTokenizer + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs, + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__()) + if ( + normalizer_state.get("lowercase", do_lower_case) != do_lower_case + or normalizer_state.get("strip_accents", strip_accents) != strip_accents + or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars + ): + normalizer_class = getattr(normalizers, normalizer_state.pop("type")) + normalizer_state["lowercase"] = do_lower_case + normalizer_state["strip_accents"] = strip_accents + normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars + self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state) + + self.do_lower_case = do_lower_case + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + + if token_ids_1 is not None: + output += token_ids_1 + [self.sep_token_id] + + return output + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["RetriBertTokenizerFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..78c549b6e294e1ca97a718ef601472d4d400e12a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_speech_to_text_2 import * + from .modeling_speech_to_text_2 import * + from .processing_speech_to_text_2 import * + from .tokenization_speech_to_text_2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/configuration_speech_to_text_2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/configuration_speech_to_text_2.py new file mode 100644 index 0000000000000000000000000000000000000000..a7cde922740a9a2173c64311aec8f6ad5496ac03 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/configuration_speech_to_text_2.py @@ -0,0 +1,134 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Speech2Text model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class Speech2Text2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Speech2Text2ForCausalLM`]. It is used to + instantiate an Speech2Text2 model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Speech2Text2 + [facebook/s2t-wav2vec2-large-en-de](https://huggingface.co/facebook/s2t-wav2vec2-large-en-de) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the Speech2Text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`Speech2TextModel`] + d_model (`int`, *optional*, defaults to 1024): + Dimensionality of the layers and the pooler layer. + decoder_layers (`int`, *optional*, defaults to 12): + Number of decoder layers. + decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the pooler. If string, `"gelu"`, `"relu"`, + `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + https://huggingface.co/papers/1909.11556>`__ for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + max_target_positions (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + + Example: + + ```python + >>> from transformers import Speech2Text2Config, Speech2Text2ForCausalLM + + >>> # Initializing a Speech2Text2 s2t_transformer_s style configuration + >>> configuration = Speech2Text2Config() + + >>> # Initializing a model (with random weights) from the s2t_transformer_s style configuration + >>> model = Speech2Text2ForCausalLM(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "speech_to_text_2" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "decoder_attention_heads", "hidden_size": "d_model"} + + def __init__( + self, + vocab_size=10000, + decoder_layers=6, + decoder_ffn_dim=2048, + decoder_attention_heads=4, + decoder_layerdrop=0.0, + use_cache=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=2, + scale_embedding=True, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + max_target_positions=1024, + **kwargs, + ): + self.vocab_size = vocab_size + self.d_model = d_model + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.decoder_layerdrop = decoder_layerdrop + self.use_cache = use_cache + self.num_hidden_layers = decoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + self.max_target_positions = max_target_positions + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) + + +__all__ = ["Speech2Text2Config"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py new file mode 100644 index 0000000000000000000000000000000000000000..854f21c0655046124fb439ec3e4a44395c33abcc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py @@ -0,0 +1,905 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Speech2Text2 model.""" + +import math +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ....activations import ACT2FN +from ....cache_utils import Cache +from ....modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask +from ....modeling_layers import GradientCheckpointingLayer +from ....modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions +from ....modeling_utils import PreTrainedModel +from ....utils import add_start_docstrings, logging, replace_return_docstrings +from ....utils.deprecation import deprecate_kwarg +from .configuration_speech_to_text_2 import Speech2Text2Config + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "Speech2Text2Config" +_CHECKPOINT_FOR_DOC = "facebook/s2t-wav2vec2-large-en-de" + + +class Speech2Text2SinusoidalPositionalEmbedding(nn.Module): + """This module produces sinusoidal positional embeddings of any length.""" + + def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None): + super().__init__() + self.offset = 2 + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + + def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): + emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx) + if hasattr(self, "weights"): + # in forward put the weights on the correct dtype and device of the param + emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device) + + self.weights = nn.Parameter(emb_weights) + self.weights.requires_grad = False + self.weights.detach_() + + @staticmethod + def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): + """ + Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the + description in Section 3.5 of "Attention Is All You Need". + """ + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb) + emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0) + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1) + if embedding_dim % 2 == 1: + # zero pad + emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) + if padding_idx is not None: + emb[padding_idx, :] = 0 + return emb.to(torch.get_default_dtype()) + + @torch.no_grad() + def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0): + bsz, seq_len = input_ids.size() + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to( + input_ids.device + ) + + # expand embeddings if needed + max_pos = self.padding_idx + 1 + seq_len + if max_pos > self.weights.size(0): + self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx) + + return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach() + + def create_position_ids_from_input_ids( + self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0 + ): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding + symbols are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +class Speech2Text2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[Speech2Text2Config] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_values[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_values` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_values is not None + and past_key_values[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_values[0] + value_states = past_key_values[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_values is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_values[0], key_states], dim=2) + value_states = torch.cat([past_key_values[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_values` is always `None` + past_key_values = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_values + + +class Speech2Text2DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Speech2Text2Config): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = Speech2Text2Attention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + if config.is_decoder: + self.encoder_attn = Speech2Text2Attention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size *(decoder_attention_heads,)*. + past_key_values (`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_values=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_values=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class Speech2Text2PreTrainedModel(PreTrainedModel): + config: Speech2Text2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, Speech2Text2SinusoidalPositionalEmbedding): + weight = module.get_embedding(*module.weight.shape, module.padding_idx) + weight = nn.Parameter(weight, requires_grad=False) + weight.detach_() + module.weight = weight + + +SPEECH_TO_TEXT_2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Speech2Text2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +class Speech2Text2Decoder(Speech2Text2PreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`Speech2Text2DecoderLayer`] + + Args: + config: Speech2Text2Config + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: Speech2Text2Config): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_target_positions + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = Speech2Text2SinusoidalPositionalEmbedding( + self.max_target_positions, + config.d_model, + self.padding_idx, + ) + + self.layers = nn.ModuleList([Speech2Text2DecoderLayer(config) for _ in range(config.decoder_layers)]) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`Speech2Text2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention + on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # embed positions + positions = self.embed_positions(input_ids, past_key_values_length=past_key_values_length) + + hidden_states = inputs_embeds + positions + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None), + past_key_values=past_key_values[idx] if past_key_values is not None else None, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The Speech2Text2 Model with a language modeling head. Can be used for summarization.", + SPEECH_TO_TEXT_2_START_DOCSTRING, +) +class Speech2Text2DecoderWrapper(Speech2Text2PreTrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. + """ + + def __init__(self, config): + super().__init__(config) + self.decoder = Speech2Text2Decoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +@add_start_docstrings( + "The Speech2Text2 Decoder with a language modeling head. Can be used as the decoder part of" + " [`EncoderDecoderModel`] and [`SpeechEncoderDecoder`].", + SPEECH_TO_TEXT_2_START_DOCSTRING, +) +class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + config.is_decoder = True + config.is_encoder_decoder = False + super().__init__(config) + self.model = Speech2Text2DecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], CausalLMOutputWithCrossAttentions]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`Speech2Text2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional + tensors are only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + + Returns: + + Example: + + ```python + >>> from transformers import ( + ... SpeechEncoderDecoderModel, + ... Speech2Text2ForCausalLM, + ... Wav2Vec2Model, + ... Speech2Text2Config, + ... Wav2Vec2Config, + ... Wav2Vec2FeatureExtractor, + ... Speech2Text2Tokenizer, + ... ) + >>> from datasets import load_dataset + + >>> feature_extractor = Wav2Vec2FeatureExtractor() + >>> tokenizer = Speech2Text2Tokenizer.from_pretrained("facebook/s2t-wav2vec2-large-en-de") + + >>> encoder = Wav2Vec2Model(Wav2Vec2Config()) + >>> decoder = Speech2Text2ForCausalLM(Speech2Text2Config()) + >>> # init random speech2text model + + >>> model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder) + >>> model.config.pad_token_id = tokenizer.pad_token_id + >>> model.config.decoder_start_token_id = tokenizer.bos_token_id + >>> # pre-process inputs and labels + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> inputs = feature_extractor( + ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" + ... ) + >>> input_values = inputs.input_values + >>> decoder_input_ids = tokenizer(ds[0]["text"], return_tensors="pt").input_ids + >>> # compute loss + + >>> loss = model(inputs=input_values, labels=decoder_input_ids).loss + >>> # backprop loss + + >>> loss.backward() # doctest: +IGNORE_RESULT + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs + ): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + if past_key_values: + past_length = past_key_values.get_seq_length() + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + # first step, decoder_cached_states are empty + return { + "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "use_cache": use_cache, + } + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +__all__ = ["Speech2Text2ForCausalLM", "Speech2Text2PreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/processing_speech_to_text_2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/processing_speech_to_text_2.py new file mode 100644 index 0000000000000000000000000000000000000000..b69420bf540cbe737d6509b79d943af9d41e6c24 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/processing_speech_to_text_2.py @@ -0,0 +1,105 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Speech processor class for Speech2Text2 +""" + +import warnings +from contextlib import contextmanager + +from ....processing_utils import ProcessorMixin + + +class Speech2Text2Processor(ProcessorMixin): + r""" + Constructs a Speech2Text2 processor which wraps a Speech2Text2 feature extractor and a Speech2Text2 tokenizer into + a single processor. + + [`Speech2Text2Processor`] offers all the functionalities of [`AutoFeatureExtractor`] and [`Speech2Text2Tokenizer`]. + See the [`~Speech2Text2Processor.__call__`] and [`~Speech2Text2Processor.decode`] for more information. + + Args: + feature_extractor (`AutoFeatureExtractor`): + An instance of [`AutoFeatureExtractor`]. The feature extractor is a required input. + tokenizer (`Speech2Text2Tokenizer`): + An instance of [`Speech2Text2Tokenizer`]. The tokenizer is a required input. + """ + + feature_extractor_class = "AutoFeatureExtractor" + tokenizer_class = "Speech2Text2Tokenizer" + + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) + self.current_processor = self.feature_extractor + self._in_target_context_manager = False + + def __call__(self, *args, **kwargs): + """ + When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's + [`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context + [`~Speech2Text2Processor.as_target_processor`] this method forwards all its arguments to + Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the docstring of the above two + methods for more information. + """ + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor(*args, **kwargs) + + if "raw_speech" in kwargs: + warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.") + audio = kwargs.pop("raw_speech") + else: + audio = kwargs.pop("audio", None) + sampling_rate = kwargs.pop("sampling_rate", None) + text = kwargs.pop("text", None) + if len(args) > 0: + audio = args[0] + args = args[1:] + + if audio is None and text is None: + raise ValueError("You need to specify either an `audio` or `text` input to process.") + + if audio is not None: + inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs) + if text is not None: + encodings = self.tokenizer(text, **kwargs) + + if text is None: + return inputs + elif audio is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] + return inputs + + @contextmanager + def as_target_processor(self): + """ + Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning + Speech2Text2. + """ + warnings.warn( + "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your " + "labels by using the argument `text` of the regular `__call__` method (either in the same call as " + "your audio inputs, or in a separate call." + ) + self._in_target_context_manager = True + self.current_processor = self.tokenizer + yield + self.current_processor = self.feature_extractor + self._in_target_context_manager = False + + +__all__ = ["Speech2Text2Processor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/tokenization_speech_to_text_2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/tokenization_speech_to_text_2.py new file mode 100644 index 0000000000000000000000000000000000000000..05c10080b7732601efd476a8c6e2640e74450a76 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/tokenization_speech_to_text_2.py @@ -0,0 +1,252 @@ +# coding=utf-8 +# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for Speech2Text2.""" + +import json +import os +from typing import Optional + +from ....tokenization_utils import PreTrainedTokenizer +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "tokenizer_config_file": "tokenizer_config.json", + "merges_file": "merges.txt", +} + + +BPE_TOKEN_MERGES = "" +BPE_TOKEN_VOCAB = "@@ " + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length + strings) + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +# Speech2Text2 has no max input length + + +class Speech2Text2Tokenizer(PreTrainedTokenizer): + """ + Constructs a Speech2Text2Tokenizer. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to + the superclass for more information regarding such methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sentence token. + eos_token (`str`, *optional*, defaults to `""`): + The end of sentence token. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + + **kwargs + Additional keyword arguments passed along to [`PreTrainedTokenizer`] + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + bos_token="", + pad_token="", + eos_token="", + unk_token="", + do_lower_case=False, + merges_file=None, + **kwargs, + ): + self.do_lower_case = do_lower_case + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + + if merges_file is None: + logger.info(f"No merges files provided. {self.__class__.__name__} can only be used for decoding.") + + self.bpe_ranks = None + self.cache = None + else: + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[:-1] + + merges = [tuple(merge.split()[:2]) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + super().__init__( + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + do_lower_case=do_lower_case, + **kwargs, + ) + + @property + def vocab_size(self) -> int: + return len(self.decoder) + + def get_vocab(self) -> dict: + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + word = tuple(token[:-1]) + (token[-1] + BPE_TOKEN_MERGES,) + if token in self.cache: + return self.cache[token] + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + if word == "\n " + BPE_TOKEN_MERGES: + word = "\n" + BPE_TOKEN_MERGES + + if word.endswith(BPE_TOKEN_MERGES): + word = word.replace(BPE_TOKEN_MERGES, "") + + word = word.replace(" ", BPE_TOKEN_VOCAB) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + + if self.bpe_ranks is None: + raise ValueError( + "This tokenizer was instantiated without a `merges.txt` file, so" + " that it can only be used for decoding, not for encoding. " + "Make sure to provide `merges.txt` file at instantiation to enable " + "encoding." + ) + + if self.do_lower_case: + text = text.lower() + + text = text.split() + + split_tokens = [] + for token in text: + if token: + split_tokens.extend(list(self.bpe(token).split(" "))) + + return split_tokens + + def _convert_token_to_id(self, token: str) -> int: + """Converts a token (str) in an index (integer) using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) in a token (str) using the vocab.""" + result = self.decoder.get(index, self.unk_token) + return result + + def convert_tokens_to_string(self, tokens: list[str]) -> str: + """ + Converts a list of output tokens into a single string. + """ + # combine tokens + string = " ".join(tokens) + + # make sure @@ tokens are concatenated + string = "".join(string.split(BPE_TOKEN_VOCAB)) + + return string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merges_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + if self.bpe_ranks is None: + return (vocab_file,) + + with open(merges_file, "w", encoding="utf-8") as writer: + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return (vocab_file, merges_file) + + +__all__ = ["Speech2Text2Tokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b535eb1df2c40a7680bbf8d57fc70b78e23437f4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_tapex import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/tokenization_tapex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/tokenization_tapex.py new file mode 100644 index 0000000000000000000000000000000000000000..b32383ddd497828b6a429d39c5d4069a21154b9f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/tokenization_tapex.py @@ -0,0 +1,1470 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for TAPEX.""" + +import json +import os +import random +from functools import lru_cache +from typing import Optional, Union + +import regex as re + +from ....file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available +from ....tokenization_utils import AddedToken, PreTrainedTokenizer +from ....tokenization_utils_base import ENCODE_KWARGS_DOCSTRING, BatchEncoding, TextInput, TruncationStrategy +from ....utils import logging + + +if is_pandas_available(): + import pandas as pd + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + + +class TapexTruncationStrategy(ExplicitEnum): + """ + Possible values for the `truncation` argument in [`~TapasTokenizer.__call__`]. Useful for tab-completion in an IDE. + """ + + DROP_ROWS_TO_FIT = "drop_rows_to_fit" + + +TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" + add_special_tokens (`bool`, *optional*, defaults to `True`): + Whether or not to encode the sequences with the special tokens relative to their model. + padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, `str`, [`TapexTruncationStrategy`] or [`~tokenization_utils_base.TruncationStrategy`], + *optional*, defaults to `False`): + + Activates and controls truncation. Accepts the following values: + + - `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will truncate + row by row, removing rows from the table. + - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will + truncate token by token, removing a token from the longest sequence in the pair if a pair of + sequences (or a batch of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to + `None`, this will use the predefined model maximum length if a maximum length is required by one of the + truncation/padding parameters. If the model has no specific maximum input length (like XLNet) + truncation/padding to a maximum length will be deactivated. + stride (`int`, *optional*, defaults to 0): + If set to a number along with `max_length`, the overflowing tokens returned when + `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence + returned to provide some overlap between truncated and overflowing sequences. The value of this + argument defines the number of overlapping tokens. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. This is especially useful to enable + the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + return_tensors (`str` or [`~file_utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. +""" + + +@lru_cache +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large # + of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset + you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe + vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length + strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class IndexedRowTableLinearize: + """ + FORMAT: col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ... + """ + + def process_table(self, table_content: dict): + """ + Given a table, TableLinearize aims at converting it into a flatten sequence with special symbols. + """ + assert "header" in table_content and "rows" in table_content, self.PROMPT_MESSAGE + # process header + table_str = self.process_header(table_content["header"]) + " " + # process rows + for i, row_example in enumerate(table_content["rows"]): + # NOTE: the row should start from row 1 instead of 0 + table_str += self.process_row(row_example, row_index=i + 1) + " " + return table_str.strip() + + def process_header(self, headers: list): + """ + Given a list of headers, TableLinearize aims at converting it into a flatten sequence with special symbols. + """ + return "col : " + " | ".join(headers) + + def process_row(self, row: list, row_index: int): + """ + Given a row, TableLinearize aims at converting it into a flatten sequence with special symbols. + """ + row_str = "" + row_cell_values = [] + for cell_value in row: + if isinstance(cell_value, int): + row_cell_values.append(str(cell_value)) + else: + row_cell_values.append(cell_value) + row_str += " | ".join(row_cell_values) + return "row " + str(row_index) + " : " + row_str + + +class TapexTokenizer(PreTrainedTokenizer): + r""" + Construct a TAPEX tokenizer. Based on byte-level Byte-Pair-Encoding (BPE). + + This tokenizer can be used to flatten one or more table(s) and concatenate them with one or more related sentences + to be used by TAPEX models. The format that the TAPEX tokenizer creates is the following: + + sentence col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ... + + The tokenizer supports a single table + single query, a single table and multiple queries (in which case the table + will be duplicated for every query), a single query and multiple tables (in which case the query will be duplicated + for every table), and multiple tables and queries. In other words, you can provide a batch of tables + questions to + the tokenizer for instance to prepare them for the model. + + Tokenization itself is based on the BPE algorithm. It is identical to the one used by BART, RoBERTa and GPT-2. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (BART tokenizer detect beginning of words by the preceding space). + max_cell_length (`int`, *optional*, defaults to 15): + Maximum number of characters per cell when linearizing a table. If this number is exceeded, truncation + takes place. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + do_lower_case=True, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + add_prefix_space=False, + max_cell_length=15, + **kwargs, + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + self.do_lower_case = do_lower_case + + # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + # additional properties + + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + do_lower_case=do_lower_case, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + max_cell_length=max_cell_length, + **kwargs, + ) + + self.max_cell_length = max_cell_length + self.table_linearize = IndexedRowTableLinearize() + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A TAPEX sequence has the following format: + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Args: + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Args: + Create a mask from the two sequences passed to be used in a sequence-pair classification task. TAPEX does not: + make use of token type ids, therefore a list of zeros is returned. + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `list[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): + text = " " + text + return (text, kwargs) + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def __call__( + self, + table: Union["pd.DataFrame", list["pd.DataFrame"]] = None, + query: Optional[Union[TextInput, list[TextInput]]] = None, + answer: Optional[Union[str, list[str]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Main method to tokenize and prepare for the model one or several table-sequence pair(s). + + Args: + table (`pd.DataFrame`, `list[pd.DataFrame]`): + Table(s) containing tabular data. + query (`str` or `list[str]`, *optional*): + Sentence or batch of sentences related to one or more table(s) to be encoded. Note that the number of + sentences must match the number of tables. + answer (`str` or `list[str]`, *optional*): + Optionally, the corresponding answer to the questions as supervision. + """ + + if table is not None: + return self.source_call_func( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + elif answer is not None: + return self.target_call_func( + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + raise ValueError("You need to provide either a `table` or an `answer`.") + + def source_call_func( + self, + table: Union["pd.DataFrame", list["pd.DataFrame"]], + query: Optional[Union[TextInput, list[TextInput]]] = None, + answer: Optional[Union[str, list[str]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + # Input type checking for clearer error + valid_table = False + valid_query = False + + # Check that table have a valid type + if isinstance(table, pd.DataFrame): + valid_table = True + elif isinstance(table, (list, tuple)) and isinstance(table[0], pd.DataFrame): + valid_table = True + + # Check that query have a valid type + if query is None or isinstance(query, str): + valid_query = True + elif isinstance(query, (list, tuple)): + if len(query) == 0 or isinstance(query[0], str): + valid_query = True + + if not valid_table: + raise ValueError( + "table input must of type `pd.DataFrame` (single example), `list[pd.DataFrame]` (batch of examples). " + ) + if not valid_query: + raise ValueError("query input must of type `str` (single example), `list[str]` (batch of examples). ") + is_batched = isinstance(table, (list, tuple)) or isinstance(query, (list, tuple)) + + if is_batched: + return self.batch_encode_plus( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.encode_plus( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def batch_encode_plus( + self, + table: Union["pd.DataFrame", list["pd.DataFrame"]], + query: Optional[list[TextInput]] = None, + answer: Optional[list[str]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Optional[Union[bool, str]] = None, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + + + This method is deprecated, `__call__` should be used instead. + + + """ + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._batch_encode_plus( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _batch_encode_plus( + self, + table: Union["pd.DataFrame", list["pd.DataFrame"]], + query: Optional[list[TextInput]] = None, + answer: Optional[list[str]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." + ) + + if isinstance(table, pd.DataFrame) and isinstance(query, (list, tuple)): + # single table, many queries case + # duplicate table for every query + table = [table] * len(query) + if isinstance(table, (list, tuple)) and isinstance(query, str): + # many tables, single query case + # duplicate query for every table + query = [query] * len(table) + + batch_outputs = self._batch_prepare_for_model( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=return_tensors, + verbose=verbose, + ) + + return BatchEncoding(batch_outputs) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def _batch_prepare_for_model( + self, + table: Union["pd.DataFrame", list["pd.DataFrame"]], + query: Optional[Union[TextInput, list[TextInput]]] = None, + answer: Optional[Union[str, list[str]]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_length: bool = False, + verbose: bool = True, + ) -> BatchEncoding: + """ + This method adds special tokens, truncates sequences if overflowing while taking into account the special + tokens and manages a moving window (with user defined stride) for overflowing tokens. + """ + batch_outputs = {} + if answer is None: + answer = [None] * len(table) + for _table, _query, _answer in zip(table, query, answer): + text = self.prepare_table_query( + _table, _query, _answer, truncation_strategy=truncation_strategy, max_length=max_length + ) + + if self.do_lower_case: + text = text.lower() + + tokens = self.tokenize(text) + outputs = self.prepare_for_model( + ids=self.convert_tokens_to_ids(tokens), + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterwards + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterwards + return_attention_mask=False, # we pad in batch afterwards + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + prepend_batch_axis=False, + verbose=verbose, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + + return batch_outputs + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING) + def encode( + self, + table: "pd.DataFrame", + query: Optional[TextInput] = None, + answer: Optional[str] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> list[int]: + """ + Prepare a table, a string and possible answer for the model. This method does not return token type IDs, + attention masks, etc. which are necessary for the model to work correctly. Use this method if you want to build + your processing on your own, otherwise refer to `__call__`. + """ + encoded_inputs = self.encode_plus( + table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + return_tensors=return_tensors, + **kwargs, + ) + + return encoded_inputs["input_ids"] + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def encode_plus( + self, + table: "pd.DataFrame", + query: Optional[TextInput] = None, + answer: Optional[str] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Optional[Union[bool, str]] = None, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._encode_plus( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _encode_plus( + self, + table: "pd.DataFrame", + query: Optional[TextInput] = None, + answer: Optional[str] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast. " + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + text = self.prepare_table_query( + table, query, answer, truncation_strategy=truncation_strategy, max_length=max_length + ) + + # if necessary, perform lower case + if self.do_lower_case: + text = text.lower() + + tokens = self.tokenize(text) + + return self.prepare_for_model( + ids=self.convert_tokens_to_ids(tokens), + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + def target_call_func( + self, + answer: Union[str, list[str]], + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + The method tokenizes and prepares the answer label for the model. + + Args: + answer (`str` or `list[str]`): + Corresponding answer supervision to the queries for training the model. + """ + is_batched = isinstance(answer, (list, tuple)) + + if is_batched: + return self.target_batch_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.target_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def target_batch_encode_plus( + self, + answer: list[str], + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Optional[Union[bool, str]] = None, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Prepare answer strings for the model. + + Args: + answer `list[str]`: + Corresponding answer supervision to the queries for training the model. + """ + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._target_batch_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _target_batch_encode_plus( + self, + answer: list[str], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + batch_outputs = {} + for text in answer: + if self.do_lower_case: + text = text.lower() + + tokens = self.tokenize(text) + outputs = self.prepare_for_model( + ids=self.convert_tokens_to_ids(tokens), + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterwards + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterwards + return_attention_mask=False, # we pad in batch afterwards + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + prepend_batch_axis=False, + verbose=verbose, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + + return BatchEncoding(batch_outputs) + + def target_encode( + self, + answer: str, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> list[int]: + """ + Prepare the answer string for the model. This method does not return token type IDs, attention masks, etc. + which are necessary for the model to work correctly. Use this method if you want to build your processing on + your own, otherwise refer to `__call__`. + + Args: + answer `str`: + Corresponding answer supervision to the queries for training the model + """ + encoded_outputs = self.target_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + return_tensors=return_tensors, + **kwargs, + ) + + return encoded_outputs["input_ids"] + + def target_encode_plus( + self, + answer: str, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Optional[Union[bool, str]] = None, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Prepare a answer string for the model. + + Args: + answer `str`: + Corresponding answer supervision to the queries for training the model. + """ + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._target_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _target_encode_plus( + self, + answer: str, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast. " + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + text = answer + + # if necessary, perform lower case + if self.do_lower_case: + text = text.lower() + + tokens = self.tokenize(text) + + return self.prepare_for_model( + ids=self.convert_tokens_to_ids(tokens), + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + def prepare_table_query( + self, + table, + query, + answer=None, + truncation_strategy=Union[str, TruncationStrategy, TapexTruncationStrategy], + max_length=None, + ): + """ + This method can be used to linearize a table and add a corresponding query. + + Optionally, it also handles truncation of the table (cells). + + An answer can be provided for more precise truncation. + """ + if not table.empty: + # step 1: create table dictionary + table_content = {"header": list(table.columns), "rows": [list(row.values) for i, row in table.iterrows()]} + + # step 2: modify table internally + # always truncate table cells based on self.max_cell_length + # optionally truncate rows if truncation_strategy is set to it + self.truncate_table_cells(table_content, query, answer) + if truncation_strategy == TapexTruncationStrategy.DROP_ROWS_TO_FIT: + self.truncate_table_rows(table_content, query, answer, max_length=max_length) + + # step 3: linearize table + linear_table = self.table_linearize.process_table(table_content) + else: + linear_table = "" + + if linear_table == "": + logger.warning( + "You provide an empty table, or all cells contain much tokens (e.g., >= 1024 tokens). " + + f"Please carefully check the corresponding table with the query : {query}." + ) + if query == "": + logger.warning("You provide nothing to query with respect to the table.") + # step 4: concatenate query with linear_table + separator = " " if query and linear_table else "" + joint_input = (query + separator + linear_table) if query else linear_table + + return joint_input + + def truncate_table_cells(self, table_content: dict, question: str, answer: list): + # TODO (Qian): is it possible to revert the original cell if it is in the final answer? + cell_mapping = {} + for row in table_content["rows"]: + for i, cell in enumerate(row): + truncate_cell = self.truncate_cell(cell) + if truncate_cell is not None: + cell_mapping[cell] = truncate_cell + row[i] = truncate_cell + + # modify the answer list + if answer is not None: + for i, case in enumerate(answer): + if case in cell_mapping: + answer[i] = cell_mapping[case] + + def truncate_cell(self, cell_value): + # do not process on these cases + if isinstance(cell_value, (int, float)): + return cell_value + if cell_value.strip() != "": + try_tokens = self.tokenize(cell_value) + if len(try_tokens) >= self.max_cell_length: + retain_tokens = try_tokens[: self.max_cell_length] + retain_cell_value = self.convert_tokens_to_string(retain_tokens) + return retain_cell_value + else: + return None + else: + return cell_value + + def truncate_table_rows( + self, table_content: dict, question: str, answer: Optional[Union[str, list[str]]] = None, max_length=None + ): + """ + Args: + table_content: + {"header": xxx, "rows": xxx, "id" (Optionally): xxx} + + question: + natural language sentence + + answer: + if for training, is the supervision; otherwise will be empty + """ + delete_ratio, remain_token_len = self.estimate_delete_ratio(table_content, question, max_length) + # randomly delete unrelated rows + self.delete_unrelated_rows(table_content, question, answer, delete_ratio) + # guarantee the result < max_length + maximum_keep_rows = 0 + for ind, row_example in enumerate(table_content["rows"]): + value_string = self.table_linearize.process_row(row_example, ind + 1) + value_token_len = len(self.tokenize(value_string)) + # over the size limit, and take action + if value_token_len > remain_token_len: + break + remain_token_len -= value_token_len + maximum_keep_rows += 1 + del table_content["rows"][maximum_keep_rows:] + + def estimate_delete_ratio(self, table_content: dict, question: str, max_length=None): + if "header" not in table_content or "rows" not in table_content: + raise ValueError("The table content should contain both 'header' and 'rows' keys.") + # calculate the tokens of header, special tokens will only be pre-prepended into question + question_tokens = self.tokenize(question, add_special_tokens=True) + # calculate the tokens of header + header_string = self.table_linearize.process_header(table_content["header"]) + header_tokens = self.tokenize(header_string, add_special_tokens=False) + # split all cell values into tokens and see how many can be accommodated + used_token_len = len(question_tokens) + len(header_tokens) + # remaining token space for rows + remain_token_len = max_length - used_token_len + + value_string = "" + for _, row_example in enumerate(table_content["rows"]): + # use a general index to roughly estimate the overall token len + value_string += self.table_linearize.process_row(row_example, 100) + " " + value_token_len = len(self.tokenize(value_string)) + + if value_token_len < remain_token_len: + # no row will be deleted + return 0.0, remain_token_len + else: + # calc a roughly delete rate + return 1.0 - remain_token_len / value_token_len, remain_token_len + + def delete_unrelated_rows(self, table_content: dict, question: str, answer: list, delete_ratio: float): + """ + The argument answer is used only during training. + """ + truncated_unrelated_indices = [] + related_indices = [] + if answer is None or len(answer) == 0: + answer_set = set() + else: + answer_set = {ans_ex.lower() for ans_ex in answer} + # add question key words into answer set + if question is not None: + answer_set.update(question.split()) + question_set = set(question.strip("?!.,").split(" ")) + row_max_len = len(table_content["rows"]) + for _row_idx, row in enumerate(table_content["rows"]): + lower_row = {str(cell).lower() for cell in row} + if len(lower_row & answer_set) == 0 and len(lower_row & question_set) == 0: + truncated_unrelated_indices.append(_row_idx) + else: + # add neighbours to preserve information aggressively + related_indices.extend([_row_idx - 2, _row_idx - 1, _row_idx, _row_idx + 1, _row_idx + 2]) + + # remove the neighbours + truncated_unrelated_indices = [ + _row_idx for _row_idx in truncated_unrelated_indices if _row_idx not in related_indices + ] + # select some cases to drop + drop_items = min(len(truncated_unrelated_indices), int(len(table_content["rows"]) * delete_ratio)) + drop_row_indices = random.choices(truncated_unrelated_indices, k=drop_items) + + for _row_idx in reversed(range(row_max_len)): + if _row_idx in drop_row_indices: + del table_content["rows"][_row_idx] + + # only when the drop ratio is too large, logging for warning. + if "id" in table_content and len(drop_row_indices) > 0: + logger.warning("Delete {:.2f} rows in table {}".format(len(drop_row_indices), table_content["id"])) + + +__all__ = ["TapexTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b4bccdd12c9703313951e223d0ac1955f9ca1581 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_trajectory_transformer import * + from .modeling_trajectory_transformer import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..5a267cc4c4b82c6703eaa3a732d3fca1606a60d7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TrajectoryTransformer model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class TrajectoryTransformerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`TrajectoryTransformerModel`]. It is used to + instantiate an TrajectoryTransformer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the + TrajectoryTransformer + [CarlCochet/trajectory-transformer-halfcheetah-medium-v2](https://huggingface.co/CarlCochet/trajectory-transformer-halfcheetah-medium-v2) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 100): + Vocabulary size of the TrajectoryTransformer model. Defines the number of different tokens that can be + represented by the `trajectories` passed when calling [`TrajectoryTransformerModel`] + action_weight (`int`, *optional*, defaults to 5): + Weight of the action in the loss function + reward_weight (`int`, *optional*, defaults to 1): + Weight of the reward in the loss function + value_weight (`int`, *optional*, defaults to 1): + Weight of the value in the loss function + block_size (`int`, *optional*, defaults to 249): + Size of the blocks in the trajectory transformer. + action_dim (`int`, *optional*, defaults to 6): + Dimension of the action space. + observation_dim (`int`, *optional*, defaults to 17): + Dimension of the observation space. + transition_dim (`int`, *optional*, defaults to 25): + Dimension of the transition space. + n_layer (`int`, *optional*, defaults to 4): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 4): + Number of attention heads for each attention layer in the Transformer encoder. + n_embd (`int`, *optional*, defaults to 128): + Dimensionality of the embeddings and hidden states. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`int`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + kaiming_initializer_range (`float, *optional*, defaults to 1): + A coefficient scaling the negative slope of the kaiming initializer rectifier for EinLinear layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + Example: + + ```python + >>> from transformers import TrajectoryTransformerConfig, TrajectoryTransformerModel + + >>> # Initializing a TrajectoryTransformer CarlCochet/trajectory-transformer-halfcheetah-medium-v2 style configuration + >>> configuration = TrajectoryTransformerConfig() + + >>> # Initializing a model (with random weights) from the CarlCochet/trajectory-transformer-halfcheetah-medium-v2 style configuration + >>> model = TrajectoryTransformerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "trajectory_transformer" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "n_embd", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=100, + action_weight=5, + reward_weight=1, + value_weight=1, + block_size=249, + action_dim=6, + observation_dim=17, + transition_dim=25, + n_layer=4, + n_head=4, + n_embd=128, + embd_pdrop=0.1, + attn_pdrop=0.1, + resid_pdrop=0.1, + learning_rate=0.0006, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + kaiming_initializer_range=1, + use_cache=True, + pad_token_id=1, + bos_token_id=50256, + eos_token_id=50256, + **kwargs, + ): + self.vocab_size = vocab_size + self.action_weight = action_weight + self.reward_weight = reward_weight + self.value_weight = value_weight + self.max_position_embeddings = max_position_embeddings + self.block_size = block_size + self.action_dim = action_dim + self.observation_dim = observation_dim + self.transition_dim = transition_dim + self.learning_rate = learning_rate + self.n_layer = n_layer + self.n_head = n_head + self.n_embd = n_embd + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.resid_pdrop = resid_pdrop + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.kaiming_initializer_range = kaiming_initializer_range + self.use_cache = use_cache + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + +__all__ = ["TrajectoryTransformerConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b6ae410c1474140c018c88bb5cbdaf8ebf076226 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py @@ -0,0 +1,602 @@ +# coding=utf-8 +# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch TrajectoryTransformer model.""" + +import math +import os +from dataclasses import dataclass +from typing import Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from ....cache_utils import Cache +from ....modeling_layers import GradientCheckpointingLayer +from ....modeling_utils import PreTrainedModel +from ....utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_trajectory_transformer import TrajectoryTransformerConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "CarlCochet/trajectory-transformer-halfcheetah-medium-v2" +_CONFIG_FOR_DOC = "TrajectoryTransformerConfig" + + +def load_tf_weights_in_trajectory_transformer(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +@dataclass +class TrajectoryTransformerOutput(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple[tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of length `config.n_layers`, containing tuples of tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`). Contains pre-computed hidden-states (key and values in the + attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. GPT2Attentions weights after the attention softmax, used to compute the weighted average + in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +class TrajectoryTransformerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: TrajectoryTransformerConfig + load_tf_weights = load_tf_weights_in_trajectory_transformer + base_model_prefix = "trajectory_transformer" + main_input_name = "trajectories" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, EinLinear): + for i in range(module.n_models): + nn.init.kaiming_uniform_(module.weight[i], a=math.sqrt(5) / self.config.kaiming_initializer_range) + if module.bias is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight[i]) + bound = (1 / math.sqrt(fan_in)) * self.config.initializer_range + nn.init.uniform_(module.bias[i], -bound, bound) + + +TRAJECTORY_TRANSFORMER_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`TrajectoryTransformerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +TRAJECTORY_TRANSFORMER_INPUTS_DOCSTRING = r""" + Args: + trajectories (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Batch of trajectories, where a trajectory is a sequence of states, actions and rewards. + past_key_values (`tuple[tuple[torch.Tensor]]` of length `config.n_layers`, *optional*): + Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see + `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have + their past given to this model should not be passed as `input_ids` as they have already been computed. + targets (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Desired targets used to compute the loss. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class EinLinear(nn.Module): + def __init__(self, n_models, in_features, out_features, bias): + super().__init__() + self.n_models = n_models + self.out_features = out_features + self.in_features = in_features + self.weight = nn.Parameter(torch.Tensor(n_models, out_features, in_features)) + if bias: + self.bias = nn.Parameter(torch.Tensor(n_models, out_features)) + else: + self.register_parameter("bias", None) + + def reset_parameters(self): + for i in range(self.n_models): + nn.init.kaiming_uniform_(self.weight[i], a=math.sqrt(5)) + if self.bias is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight[i]) + bound = 1 / math.sqrt(fan_in) + nn.init.uniform_(self.bias[i], -bound, bound) + + def forward(self, input): + """ + Args: + input (`torch.FloatTensor` of shape `(B, n_models, input_dim)`): + The input to the layer. + """ + # [ batch_size x n_models x output_dim ] + output = torch.einsum("eoi,bei->beo", self.weight, input) + if self.bias is not None: + raise RuntimeError() + return output + + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + + if config.n_embd % config.n_head != 0: + raise ValueError(f"n_head ({config.n_head}) should be a divisor of n_embd ({config.n_embd})") + + # key, query, value projections for all heads + self.key = nn.Linear(config.n_embd, config.n_embd) + self.query = nn.Linear(config.n_embd, config.n_embd) + self.value = nn.Linear(config.n_embd, config.n_embd) + + # regularization + self.attn_drop = nn.Dropout(config.attn_pdrop) + self.resid_drop = nn.Dropout(config.resid_pdrop) + + # output projection + self.proj = nn.Linear(config.n_embd, config.n_embd) + + # causal mask to ensure that attention is only applied to the left in the input sequence + self.register_buffer( + "mask", + torch.tril(torch.ones(config.block_size, config.block_size)).view( + 1, 1, config.block_size, config.block_size + ), + persistent=False, + ) + + # mask previous value estimates + joined_dim = config.observation_dim + config.action_dim + 2 + self.mask.squeeze()[:, joined_dim - 1 :: joined_dim] = 0 + + self.n_head = config.n_head + + def forward( + self, + hidden_states: Optional[tuple[torch.FloatTensor]], + layer_past: Optional[tuple[torch.Tensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ): + batch_size, sequence_length, embedding_dim = hidden_states.size() + + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + # [ batch_size x n_heads x sequence_length x head_dim ] + key = ( + self.key(hidden_states) + .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head) + .transpose(1, 2) + ) + query = ( + self.query(hidden_states) + .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head) + .transpose(1, 2) + ) + value = ( + self.value(hidden_states) + .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head) + .transpose(1, 2) + ) + + if layer_past is not None: + past_key, past_value = layer_past + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + + if use_cache is True: + present = (key, value) + else: + present = None + + # causal self-attention + # [ batch_size x n_heads x sequence_length x sequence_length ] + attn_weights = (torch.matmul(query, key.transpose(-2, -1))) * (1.0 / math.sqrt(key.size(-1))) + attn_weights = attn_weights.masked_fill( + self.mask[:, :, :sequence_length, :sequence_length] == 0, torch.finfo(attn_weights.dtype).min + ) + attn_weights = F.softmax(attn_weights, dim=-1) + self._attn_map = attn_weights.clone() + attn_weights = self.attn_drop(attn_weights) + + output = torch.matmul(attn_weights, value) + # [ batch_size x sequence_length x embedding_dim ] + # re-assemble all head outputs side by side + output = output.transpose(1, 2).contiguous().view(batch_size, sequence_length, embedding_dim) + + # output projection + output = self.resid_drop(self.proj(output)) + + outputs = (output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Block(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config.n_embd) + self.ln2 = nn.LayerNorm(config.n_embd) + self.attn = CausalSelfAttention(config) + + # MLP + self.l1 = nn.Linear(config.n_embd, 4 * config.n_embd) + self.act = nn.GELU() + self.l2 = nn.Linear(4 * config.n_embd, config.n_embd) + self.drop = nn.Dropout(config.resid_pdrop) + + def forward( + self, + hidden_states: Optional[tuple[torch.FloatTensor]], + layer_past: Optional[tuple[torch.Tensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ): + residual = hidden_states + hidden_states = self.ln1(hidden_states) + + attn_outputs = self.attn( + hidden_states, layer_past=layer_past, use_cache=use_cache, output_attentions=output_attentions + ) + attn_output = attn_outputs[0] + outputs = attn_outputs[1:] + hidden_states = attn_output + residual + + residual = hidden_states + hidden_states = self.ln2(hidden_states) + hidden_states = self.l1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.l2(hidden_states) + hidden_states = residual + self.drop(hidden_states) + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs + + +@add_start_docstrings( + "The bare TrajectoryTransformer Model transformer outputting raw hidden-states without any specific head on top.", + TRAJECTORY_TRANSFORMER_START_DOCSTRING, +) +class TrajectoryTransformerModel(TrajectoryTransformerPreTrainedModel): + """the full GPT language model, with a context size of block_size""" + + def __init__(self, config): + super().__init__(config) + + # input embedding stem (+1 for stop token) + self.tok_emb = nn.Embedding(config.vocab_size * config.transition_dim + 1, config.n_embd) + + self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd)) + self.drop = nn.Dropout(config.embd_pdrop) + # transformer + self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)]) + # decoder head + self.ln_f = nn.LayerNorm(config.n_embd) + self.head = EinLinear(config.transition_dim, config.n_embd, config.vocab_size + 1, bias=False) + + self.vocab_size = config.vocab_size + self.stop_token = config.vocab_size * config.transition_dim + self.block_size = config.block_size + + self.observation_dim = config.observation_dim + self.action_dim = config.action_dim + self.transition_dim = config.transition_dim + self.embedding_dim = config.n_embd + + self.action_weight = config.action_weight + self.reward_weight = config.reward_weight + self.value_weight = config.value_weight + + self.gradient_checkpointing = False + + self.post_init() + + def get_block_size(self): + return self.block_size + + def offset_tokens(self, trajectories): + _, sequence_length = trajectories.shape + + n_states = int(np.ceil(sequence_length / self.transition_dim)) + + offsets = torch.arange(self.transition_dim) * self.vocab_size + offsets = offsets.repeat(n_states).to(trajectories.device) + + offset_trajectories = trajectories + offsets[:sequence_length] + offset_trajectories[trajectories == self.vocab_size] = self.stop_token + return offset_trajectories + + def pad_to_full_observation(self, hidden_states): + batch_size, sequence_length, _ = hidden_states.shape + + n_pad = (self.transition_dim - sequence_length % self.transition_dim) % self.transition_dim + padding = torch.zeros(batch_size, n_pad, self.embedding_dim, device=hidden_states.device) + + # [ batch_size x padded_sequence_length' x embedding_dim ] + hidden_states_pad = torch.cat([hidden_states, padding], dim=1) + hidden_states_pad = hidden_states_pad.view(-1, self.transition_dim, self.embedding_dim) + + return hidden_states_pad, n_pad + + @add_start_docstrings_to_model_forward( + TRAJECTORY_TRANSFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length") + ) + @replace_return_docstrings(output_type=TrajectoryTransformerOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + trajectories: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + targets: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], TrajectoryTransformerOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import TrajectoryTransformerModel + >>> import torch + + >>> model = TrajectoryTransformerModel.from_pretrained( + ... "CarlCochet/trajectory-transformer-halfcheetah-medium-v2" + ... ) + >>> model.to(device) + >>> model.eval() + + >>> observations_dim, action_dim, batch_size = 17, 6, 256 + >>> seq_length = observations_dim + action_dim + 1 + + >>> trajectories = torch.LongTensor([np.random.permutation(self.seq_length) for _ in range(batch_size)]).to( + ... device + ... ) + >>> targets = torch.LongTensor([np.random.permutation(self.seq_length) for _ in range(batch_size)]).to(device) + + >>> outputs = model( + ... trajectories, + ... targets=targets, + ... use_cache=True, + ... output_attentions=True, + ... output_hidden_states=True, + ... return_dict=True, + ... ) + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if past_key_values is None: + past_key_values = tuple([None] * len(self.blocks)) + + batch_size, sequence_length = trajectories.size() + + if sequence_length > self.block_size: + raise ValueError("Cannot forward, model block size is exhausted.") + + offset_trajectories = self.offset_tokens(trajectories) + # [ batch_size x sequence_length x embedding_dim ] + # forward the GPT model + token_embeddings = self.tok_emb(offset_trajectories) # each index maps to a (learnable) vector + position_embeddings = self.pos_emb[:, :sequence_length, :] # each position maps to a (learnable) vector + + hidden_states = self.drop(token_embeddings + position_embeddings) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for i, (block, layer_past) in enumerate(zip(self.blocks, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block(hidden_states, layer_past, use_cache, output_attentions) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + # [ batch_size x sequence_length x embedding_dim ] + hidden_state = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + hidden_states_pad, n_pad = self.pad_to_full_observation(hidden_state) + + logits = self.head(hidden_states_pad) + logits = logits.reshape(batch_size, sequence_length + n_pad, self.vocab_size + 1) + logits = logits[:, :sequence_length] + + # if we are given some desired targets also calculate the loss + if targets is not None: + loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.view(-1), reduction="none") + if self.action_weight != 1 or self.reward_weight != 1 or self.value_weight != 1: + # make weights + n_states = int(np.ceil(sequence_length / self.transition_dim)) + weights = torch.cat( + [ + torch.ones(self.observation_dim, device=trajectories.device), + torch.ones(self.action_dim, device=trajectories.device) * self.action_weight, + torch.ones(1, device=trajectories.device) * self.reward_weight, + torch.ones(1, device=trajectories.device) * self.value_weight, + ] + ) + weights = weights.repeat(n_states) + weights = weights[1:].repeat(batch_size, 1) + loss = loss * weights.view(-1) + loss = (loss * attention_mask.view(-1)).mean() + else: + loss = None + + if not return_dict: + return tuple(v for v in [loss, logits, presents, all_hidden_states, all_self_attentions] if v is not None) + + return TrajectoryTransformerOutput( + loss=loss, + logits=logits, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +__all__ = [ + "TrajectoryTransformerModel", + "TrajectoryTransformerPreTrainedModel", + "load_tf_weights_in_trajectory_transformer", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0ac9a2cbf4766bd187d90fdaf46505db3e92b68f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_transfo_xl import * + from .modeling_tf_transfo_xl import * + from .modeling_transfo_xl import * + from .tokenization_transfo_xl import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py new file mode 100644 index 0000000000000000000000000000000000000000..b7926246c4cbb37904fb3044507f4831bb7d9bb9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py @@ -0,0 +1,189 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Transformer XL configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class TransfoXLConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`TransfoXLModel`] or a [`TFTransfoXLModel`]. It is + used to instantiate a Transformer-XL model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the TransfoXL + [transfo-xl/transfo-xl-wt103](https://huggingface.co/transfo-xl/transfo-xl-wt103) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 267735): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`TransfoXLModel`] or [`TFTransfoXLModel`]. + cutoffs (`list[int]`, *optional*, defaults to `[20000, 40000, 200000]`): + Cutoffs for the adaptive softmax. + d_model (`int`, *optional*, defaults to 1024): + Dimensionality of the model's hidden states. + d_embed (`int`, *optional*, defaults to 1024): + Dimensionality of the embeddings + n_head (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + d_head (`int`, *optional*, defaults to 64): + Dimensionality of the model's heads. + d_inner (`int`, *optional*, defaults to 4096): + Inner dimension in FF + div_val (`int`, *optional*, defaults to 4): + Divident value for adaptive input and softmax + pre_lnorm (`boolean`, *optional*, defaults to `False`): + Whether or not to apply LayerNorm to the input instead of the output in the blocks. + n_layer (`int`, *optional*, defaults to 18): + Number of hidden layers in the Transformer encoder. + mem_len (`int`, *optional*, defaults to 1600): + Length of the retained previous heads. + clamp_len (`int`, *optional*, defaults to 1000): + Use the same pos embeddings after clamp_len. + same_length (`boolean`, *optional*, defaults to `True`): + Whether or not to use the same attn length for all tokens + proj_share_all_but_first (`boolean`, *optional*, defaults to `True`): + True to share all but first projs, False not to share. + attn_type (`int`, *optional*, defaults to 0): + Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. + sample_softmax (`int`, *optional*, defaults to -1): + Number of samples in the sampled softmax. + adaptive (`boolean`, *optional*, defaults to `True`): + Whether or not to use adaptive softmax. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + dropatt (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + untie_r (`boolean`, *optional*, defaults to `True`): + Whether ot not to untie relative position biases. + init (`str`, *optional*, defaults to `"normal"`): + Parameter initializer to use. + init_range (`float`, *optional*, defaults to 0.01): + Parameters initialized by U(-init_range, init_range). + proj_init_std (`float`, *optional*, defaults to 0.01): + Parameters initialized by N(0, init_std) + init_std (`float`, *optional*, defaults to 0.02): + Parameters initialized by N(0, init_std) + layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): + The epsilon to use in the layer normalization layers + eos_token_id (`int`, *optional*, defaults to 0): + End of stream token id. + + Examples: + + ```python + >>> from transformers import TransfoXLConfig, TransfoXLModel + + >>> # Initializing a Transformer XL configuration + >>> configuration = TransfoXLConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = TransfoXLModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "transfo-xl" + keys_to_ignore_at_inference = ["mems"] + attribute_map = { + "n_token": "vocab_size", + "hidden_size": "d_model", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=267735, + cutoffs=[20000, 40000, 200000], + d_model=1024, + d_embed=1024, + n_head=16, + d_head=64, + d_inner=4096, + div_val=4, + pre_lnorm=False, + n_layer=18, + mem_len=1600, + clamp_len=1000, + same_length=True, + proj_share_all_but_first=True, + attn_type=0, + sample_softmax=-1, + adaptive=True, + dropout=0.1, + dropatt=0.0, + untie_r=True, + init="normal", + init_range=0.01, + proj_init_std=0.01, + init_std=0.02, + layer_norm_epsilon=1e-5, + eos_token_id=0, + **kwargs, + ): + self.vocab_size = vocab_size + self.cutoffs = [] + self.cutoffs.extend(cutoffs) + if proj_share_all_but_first: + self.tie_projs = [False] + [True] * len(self.cutoffs) + else: + self.tie_projs = [False] + [False] * len(self.cutoffs) + self.d_model = d_model + self.d_embed = d_embed + self.d_head = d_head + self.d_inner = d_inner + self.div_val = div_val + self.pre_lnorm = pre_lnorm + self.n_layer = n_layer + self.n_head = n_head + self.mem_len = mem_len + self.same_length = same_length + self.attn_type = attn_type + self.clamp_len = clamp_len + self.sample_softmax = sample_softmax + self.adaptive = adaptive + self.dropout = dropout + self.dropatt = dropatt + self.untie_r = untie_r + self.init = init + self.init_range = init_range + self.proj_init_std = proj_init_std + self.init_std = init_std + self.layer_norm_epsilon = layer_norm_epsilon + super().__init__(eos_token_id=eos_token_id, **kwargs) + + @property + def max_position_embeddings(self): + # Message copied from Transformer-XL documentation + logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.") + return -1 + + @max_position_embeddings.setter + def max_position_embeddings(self, value): + # Message copied from Transformer-XL documentation + raise NotImplementedError( + f"The model {self.model_type} is one of the few models that has no sequence length limit." + ) + + +__all__ = ["TransfoXLConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py new file mode 100644 index 0000000000000000000000000000000000000000..3c7830d633448bde3b8a420f0d84cdd070498882 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py @@ -0,0 +1,1128 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +TF 2.0 Transformer XL model. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +import numpy as np +import tensorflow as tf + +from ....modeling_tf_utils import ( + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ....tf_utils import shape_list, stable_softmax +from ....utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_transfo_xl import TransfoXLConfig +from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "transfo-xl/transfo-xl-wt103" +_CONFIG_FOR_DOC = "TransfoXLConfig" + + +class TFPositionalEmbedding(keras.layers.Layer): + def __init__(self, demb, **kwargs): + super().__init__(**kwargs) + + self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb)) + + def call(self, pos_seq, bsz=None): + self.inv_freq = tf.cast(self.inv_freq, dtype=pos_seq.dtype) + sinusoid_inp = tf.einsum("i,j->ij", pos_seq, self.inv_freq) + pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1) + + if bsz is not None: + return tf.tile(pos_emb[:, None, :], [1, bsz, 1]) + else: + return pos_emb[:, None, :] + + +class TFPositionwiseFF(keras.layers.Layer): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs): + super().__init__(**kwargs) + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.layer_1 = keras.layers.Dense( + d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0" + ) + self.drop_1 = keras.layers.Dropout(dropout) + self.layer_2 = keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3") + self.drop_2 = keras.layers.Dropout(dropout) + + self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") + + self.pre_lnorm = pre_lnorm + + def call(self, inp, training=False): + if self.pre_lnorm: + # layer normalization + positionwise feed-forward + core_out = self.layer_norm(inp) + core_out = self.layer_1(core_out) + core_out = self.drop_1(core_out, training=training) + core_out = self.layer_2(core_out) + core_out = self.drop_2(core_out, training=training) + + # residual connection + output = core_out + inp + else: + # positionwise feed-forward + core_out = self.layer_1(inp) + core_out = self.drop_1(core_out, training=training) + core_out = self.layer_2(core_out) + core_out = self.drop_2(core_out, training=training) + + # residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + + +class TFRelPartialLearnableMultiHeadAttn(keras.layers.Layer): + def __init__( + self, + n_head, + d_model, + d_head, + dropout, + dropatt=0.0, + pre_lnorm=False, + r_r_bias=None, + r_w_bias=None, + layer_norm_epsilon=1e-5, + init_std=0.02, + output_attentions=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + self.output_attentions = output_attentions + + self.qkv_net = keras.layers.Dense( + 3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net" + ) + + self.drop = keras.layers.Dropout(dropout) + self.dropatt = keras.layers.Dropout(dropatt) + self.o_net = keras.layers.Dense( + d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net" + ) + + self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") + + self.scale = 1 / (d_head**0.5) + + self.pre_lnorm = pre_lnorm + + if r_r_bias is not None and r_w_bias is not None: # Biases are shared + self.r_r_bias = r_r_bias + self.r_w_bias = r_w_bias + else: + self.r_r_bias = None + self.r_w_bias = None + + self.r_net = keras.layers.Dense( + self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net" + ) + + def build(self, input_shape): + if self.r_r_bias is None or self.r_w_bias is None: # Biases are not shared + self.r_r_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" + ) + self.r_w_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" + ) + super().build(input_shape) + + def _rel_shift(self, x): + x_size = shape_list(x) + + x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]]) + x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]]) + x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1]) + x = tf.reshape(x, x_size) + + return x + + def call(self, w, r, attn_mask, mems, head_mask, output_attentions, training=False): + qlen, rlen, bsz = shape_list(w)[0], shape_list(r)[0], shape_list(w)[1] + + if mems is not None: + mems = tf.cast(mems, dtype=w.dtype) + cat = tf.concat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1) + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1) + + klen = shape_list(w_head_k)[0] + + w_head_q = tf.reshape(w_head_q, (qlen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head + w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head + w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head + + r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head)) # qlen x n_head x d_head + + # compute attention score + rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head + AC = tf.einsum("ibnd,jbnd->ijbn", rw_head_q, w_head_k) # qlen x klen x bsz x n_head + + rr_head_q = w_head_q + self.r_r_bias + BD = tf.einsum("ibnd,jnd->ijbn", rr_head_q, r_head_k) # qlen x klen x bsz x n_head + BD = self._rel_shift(BD) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score = attn_score * self.scale + + # compute attention probability + if attn_mask is not None: + attn_mask_t = attn_mask[:, :, None, None] + attn_mask_t = tf.cast(attn_mask_t, dtype=attn_score.dtype) + attn_score = attn_score * (1.0 - attn_mask_t) - 1e30 * attn_mask_t + + # [qlen x klen x bsz x n_head] + attn_prob = stable_softmax(attn_score, axis=1) + attn_prob = self.dropatt(attn_prob, training=training) + + # Mask heads if we want to + if head_mask is not None: + attn_prob = attn_prob * head_mask + + # compute attention vector + attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v) + + # [qlen x bsz x n_head x d_head] + attn_vec_sizes = shape_list(attn_vec) + attn_vec = tf.reshape(attn_vec, (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head)) + + # linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out, training=training) + + if self.pre_lnorm: + # residual connection + outputs = [w + attn_out] + else: + # residual connection + layer normalization + outputs = [self.layer_norm(w + attn_out)] + + if output_attentions: + outputs.append(attn_prob) + + return outputs + + +class TFRelPartialLearnableDecoderLayer(keras.layers.Layer): + def __init__( + self, + n_head, + d_model, + d_head, + d_inner, + dropout, + dropatt=0.0, + pre_lnorm=False, + r_w_bias=None, + r_r_bias=None, + layer_norm_epsilon=1e-5, + init_std=0.02, + output_attentions=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.dec_attn = TFRelPartialLearnableMultiHeadAttn( + n_head, + d_model, + d_head, + dropout, + dropatt=dropatt, + pre_lnorm=pre_lnorm, + r_w_bias=r_w_bias, + r_r_bias=r_r_bias, + init_std=init_std, + layer_norm_epsilon=layer_norm_epsilon, + output_attentions=output_attentions, + name="dec_attn", + ) + self.pos_ff = TFPositionwiseFF( + d_model, + d_inner, + dropout, + pre_lnorm=pre_lnorm, + init_std=init_std, + layer_norm_epsilon=layer_norm_epsilon, + name="pos_ff", + ) + + def call(self, dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions, training=False): + attn_outputs = self.dec_attn(dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions, training=training) + ff_output = self.pos_ff(attn_outputs[0], training=training) + + outputs = [ff_output] + attn_outputs[1:] + + return outputs + + +class TFTransfoEmbeddings(keras.layers.Layer): + def __init__(self, vocab_size, emb_size, init_std, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.emb_size = emb_size + self.init_std = init_std + + def build(self, input_shape): + self.weight = self.add_weight( + shape=(self.vocab_size, self.emb_size), + initializer=get_initializer(self.init_std), + name="embeddings", + ) + + super().build(input_shape) + + def call(self, inputs): + return tf.gather(self.weight, inputs) + + +class TFAdaptiveEmbedding(keras.layers.Layer): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs): + super().__init__(**kwargs) + + self.n_token = n_token + self.d_embed = d_embed + self.init_std = init_std + + self.cutoffs = cutoffs + [n_token] + self.div_val = div_val + self.d_proj = d_proj + + self.emb_scale = d_proj**0.5 + + self.cutoff_ends = [0] + self.cutoffs + + self.emb_layers = [] + self.emb_projs = [] + + if div_val == 1: + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + d_emb_i = d_embed // (div_val**i) + self.emb_layers.append( + TFTransfoEmbeddings( + r_idx - l_idx, + d_emb_i, + init_std, + name=f"emb_layers_._{i}", + ) + ) + + def build(self, input_shape): + for i in range(len(self.cutoffs)): + d_emb_i = self.d_embed // (self.div_val**i) + self.emb_projs.append( + self.add_weight( + shape=(d_emb_i, self.d_proj), + initializer=get_initializer(self.init_std), + trainable=True, + name=f"emb_projs_._{i}", + ) + ) + + super().build(input_shape) + + def call(self, inp): + if self.div_val == 1: + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + else: + inp_flat = tf.reshape(inp, (-1,)) + emb_flat = tf.zeros([shape_list(inp_flat)[0], self.d_proj]) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + + mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) + + inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx + emb_i = self.emb_layers[i](inp_i) + emb_i = tf.einsum("id,de->ie", emb_i, self.emb_projs[i]) + + mask_idx = tf.where(mask_i) + scatter = tf.scatter_nd(mask_idx, emb_i, shape_list(emb_flat)) + emb_flat = tf.cast(emb_flat, dtype=scatter.dtype) + emb_flat += scatter + + embed_shape = shape_list(inp) + [self.d_proj] + embed = tf.reshape(emb_flat, embed_shape) + + embed *= self.emb_scale + + return embed + + +@keras_serializable +class TFTransfoXLMainLayer(keras.layers.Layer): + config_class = TransfoXLConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.output_hidden_states = config.output_hidden_states + self.output_attentions = config.output_attentions + self.return_dict = config.use_return_dict + + self.n_token = config.vocab_size + + self.d_embed = config.d_embed + self.d_model = config.d_model + self.n_head = config.n_head + self.d_head = config.d_head + self.untie_r = config.untie_r + + self.word_emb = TFAdaptiveEmbedding( + config.vocab_size, + config.d_embed, + config.d_model, + config.cutoffs, + div_val=config.div_val, + init_std=config.init_std, + name="word_emb", + ) + + self.drop = keras.layers.Dropout(config.dropout) + + self.n_layer = config.n_layer + self.mem_len = config.mem_len + self.attn_type = config.attn_type + + self.layers = [] + if config.attn_type == 0: # the default attention + for i in range(config.n_layer): + self.layers.append( + TFRelPartialLearnableDecoderLayer( + config.n_head, + config.d_model, + config.d_head, + config.d_inner, + config.dropout, + dropatt=config.dropatt, + pre_lnorm=config.pre_lnorm, + r_w_bias=None if self.untie_r else self.r_w_bias, + r_r_bias=None if self.untie_r else self.r_r_bias, + layer_norm_epsilon=config.layer_norm_epsilon, + init_std=config.init_std, + output_attentions=self.output_attentions, + name=f"layers_._{i}", + ) + ) + else: # learnable embeddings and absolute embeddings + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + + self.same_length = config.same_length + self.clamp_len = config.clamp_len + + if self.attn_type == 0: # default attention + self.pos_emb = TFPositionalEmbedding(self.d_model, name="pos_emb") + else: # learnable embeddings and absolute embeddings + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + + def build(self, input_shape): + if not self.untie_r: + self.r_w_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" + ) + self.r_r_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" + ) + super().build(input_shape) + + def get_input_embeddings(self): + return self.word_emb + + def set_input_embeddings(self, value): + raise NotImplementedError + + def backward_compatible(self): + self.sample_softmax = -1 + + def reset_memory_length(self, mem_len): + self.mem_len = mem_len + + def _prune_heads(self, heads): + raise NotImplementedError + + def init_mems(self, bsz): + if self.mem_len > 0: + mems = [] + for i in range(self.n_layer): + empty = tf.zeros([self.mem_len, bsz, self.d_model]) + mems.append(empty) + + return mems + else: + return None + + def _update_mems(self, hids, mems, mlen, qlen): + # does not deal with None + if mems is None: + return None + + # mems is not None + assert len(hids) == len(mems), "len(hids) != len(mems)" + + # There are `mlen + qlen` steps that can be cached into mems + new_mems = [] + end_idx = mlen + tf.math.maximum(0, qlen) + beg_idx = tf.math.maximum(0, end_idx - tf.convert_to_tensor(self.mem_len)) + for i in range(len(hids)): + mems[i] = tf.cast(mems[i], dtype=hids[i].dtype) + cat = tf.concat([mems[i], hids[i]], axis=0) + tf.stop_gradient(cat) + new_mems.append(cat[beg_idx:end_idx]) + + return new_mems + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + mems: list[tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool = False, + ): + # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library + # so we transpose here from shape [bsz, len] to shape [len, bsz] + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_ids = tf.transpose(input_ids, perm=(1, 0)) + qlen, bsz = shape_list(input_ids) + elif inputs_embeds is not None: + inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2)) + qlen, bsz = shape_list(inputs_embeds)[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if mems is None: + mems = self.init_mems(bsz) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) + # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.n_layer + + if inputs_embeds is not None: + word_emb = inputs_embeds + else: + word_emb = self.word_emb(input_ids) + + mlen = shape_list(mems[0])[0] if mems is not None else 0 + klen = mlen + qlen + + # Compute decoder attention mask + all_ones = tf.ones([qlen, klen], dtype=tf.int32) + upper_mask = 1 - tf.linalg.band_part(tf.ones([qlen, klen], dtype=tf.int32), -1, mlen) + if self.same_length: + mask_len = klen - self.mem_len + mask_shift_len = qlen - tf.nn.relu(mask_len) # Lazy clamping of negatives to zero + + # Use an indicator variable instead of a conditional to keep the compiler happy + lower_mask = tf.linalg.band_part(all_ones, -1, 0) - ( + tf.linalg.band_part(all_ones, mask_shift_len - 1, 0) * tf.cast(mask_shift_len != 0, tf.int32) + ) + dec_attn_mask = upper_mask + lower_mask + else: + dec_attn_mask = upper_mask + + hids = [] + attentions = [] if output_attentions else None + if self.attn_type == 0: # default + pos_seq = tf.range(klen - 1, -1, -1.0) + if self.clamp_len > 0: + pos_seq = tf.minimum(pos_seq, self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb, training=training) + pos_emb = self.drop(pos_emb, training=training) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + mems_i = None if mems is None else mems[i] + layer_outputs = layer( + core_out, + pos_emb, + dec_attn_mask, + mems_i, + head_mask[i], + output_attentions, + training=training, + ) + core_out = layer_outputs[0] + if output_attentions: + attentions.append(layer_outputs[1]) + else: # learnable embeddings and absolute embeddings + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + + core_out = self.drop(core_out, training=training) + + new_mems = self._update_mems(hids, mems, mlen, qlen) + + # We transpose back here to shape [bsz, len, hidden_dim] + core_out = tf.transpose(core_out, perm=(1, 0, 2)) + + if output_hidden_states: + # Transpose to library standard shape [bsz, len, hidden_dim] and add last layer + hids = tuple(tf.transpose(t, perm=(1, 0, 2)) for t in hids) + hids = hids + (core_out,) + else: + hids = None + if output_attentions: + # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len] + attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions) + + if not return_dict: + return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None) + + return TFTransfoXLModelOutput( + last_hidden_state=core_out, + mems=new_mems, + hidden_states=hids, + attentions=attentions, + ) + + +class TFTransfoXLPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = TransfoXLConfig + base_model_prefix = "transformer" + + +@dataclass +class TFTransfoXLModelOutput(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + mems (`list[tf.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` + input) to speed up sequential decoding. The token ids which have their past given to this model should not + be passed as input ids as they have already been computed. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: tf.Tensor | None = None + mems: list[tf.Tensor] = None + hidden_states: tuple[tf.Tensor] | None = None + attentions: tuple[tf.Tensor] | None = None + + +@dataclass +class TFTransfoXLLMHeadModelOutput(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + losses (`tf.Tensor` of shape *(batch_size, sequence_length-1)*, *optional*, returned when `labels` is provided): + Language modeling losses (not reduced). + prediction_scores (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax). + mems (`list[tf.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` + input) to speed up sequential decoding. The token ids which have their past given to this model should not + be passed as input ids as they have already been computed. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + prediction_scores: tf.Tensor | None = None + mems: list[tf.Tensor] = None + hidden_states: tuple[tf.Tensor] | None = None + attentions: tuple[tf.Tensor] | None = None + + +@dataclass +class TFTransfoXLSequenceClassifierOutputWithPast(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + mems (`list[tf.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` + input) to speed up sequential decoding. The token ids which have their past given to this model should not + be passed as input ids as they have already been computed. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + logits: tf.Tensor | None = None + mems: list[tf.Tensor] = None + hidden_states: tuple[tf.Tensor] | None = None + attentions: tuple[tf.Tensor] | None = None + + +TRANSFO_XL_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`TransfoXLConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +TRANSFO_XL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + mems (`list[tf.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see + `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems + given to this model should not be passed as `input_ids` as they have already been computed. + head_mask (`tf.Tensor` or `Numpy array` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + TRANSFO_XL_START_DOCSTRING, +) +class TFTransfoXLModel(TFTransfoXLPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFTransfoXLMainLayer(config, name="transformer") + + @unpack_inputs + @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTransfoXLModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + mems: list[tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFTransfoXLModelOutput | tuple[tf.Tensor]: + outputs = self.transformer( + input_ids=input_ids, + mems=mems, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + +@add_start_docstrings( + """ + The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive + input embeddings) + """, + TRANSFO_XL_START_DOCSTRING, +) +class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.transformer = TFTransfoXLMainLayer(config, name="transformer") + self.sample_softmax = config.sample_softmax + assert self.sample_softmax <= 0, ( + "Sampling from the softmax is not implemented yet. Please look at issue: #3310:" + " https://github.com/huggingface/transformers/issues/3310" + ) + + self.crit = TFAdaptiveSoftmaxMask( + config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name="crit" + ) + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError() + + def get_output_embeddings(self): + """Double-check if you are using adaptive softmax.""" + if len(self.crit.out_layers) > 0: + return self.crit.out_layers[-1] + return None + + def reset_memory_length(self, mem_len): + self.transformer.reset_memory_length(mem_len) + + def init_mems(self, bsz): + return self.transformer.init_mems(bsz) + + @unpack_inputs + @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTransfoXLLMHeadModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + mems: list[tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool = False, + ) -> TFTransfoXLLMHeadModelOutput | tuple[tf.Tensor]: + if input_ids is not None: + bsz, tgt_len = shape_list(input_ids)[:2] + else: + bsz, tgt_len = shape_list(inputs_embeds)[:2] + + transformer_outputs = self.transformer( + input_ids, + mems, + head_mask, + inputs_embeds, + output_attentions, + output_hidden_states, + return_dict, + training=training, + ) + + last_hidden = transformer_outputs[0] + pred_hid = last_hidden[:, -tgt_len:] + + softmax_output = self.crit(pred_hid, labels, training=training) + prediction_scores = softmax_output if labels is None else () + + if not return_dict: + return (prediction_scores,) + transformer_outputs[1:] + + return TFTransfoXLLMHeadModelOutput( + prediction_scores=prediction_scores, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **model_kwargs): + inputs = {} + + # if past is defined in model kwargs then use it for faster decoding + if past_key_values: + input_ids = tf.expand_dims(input_ids[:, -1], axis=-1) + else: + input_ids = input_ids + + return inputs + + # Adapted from the torch tie_weights function + def tf_to_pt_weight_rename(self, tf_weight): + if self.config.tie_word_embeddings and "crit.out_layers" in tf_weight: + return tf_weight, tf_weight.replace("crit.out_layers", "transformer.word_emb.emb_layers") + elif self.config.tie_projs and "crit.out_projs" in tf_weight: + for i, tie_proj in enumerate(self.config.tie_projs): + if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed: + # self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0] + return tf_weight, tf_weight.replace(f"crit.out_projs.{i}", "transformer.word_emb.emb_projs.0") + elif tie_proj and self.config.div_val != 1: + # self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i] + return tf_weight, tf_weight.replace("crit.out_projs", "transformer.word_emb.emb_projs") + else: + return (tf_weight,) + + +@add_start_docstrings( + """ + The Transfo XL Model transformer with a sequence classification head on top (linear layer). + + [`TFTransfoXLForSequenceClassification`] uses the last token in order to do the classification, as other causal + models (e.g. GPT-1,GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + TRANSFO_XL_START_DOCSTRING, +) +class TFTransfoXLForSequenceClassification(TFTransfoXLPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + self.score = keras.layers.Dense( + config.num_labels, + kernel_initializer=get_initializer(config.init_range), + name="score", + use_bias=False, + ) + self.transformer = TFTransfoXLMainLayer(config, name="transformer") + + def get_output_embeddings(self): + # Remove after transformers v4.32. Fix this model's `test_model_common_attributes` test too. + logger.warning( + "Sequence classification models do not have output embeddings. `.get_output_embeddings` will be removed " + "in transformers v4.32." + ) + return self.transformer.word_emb + + @unpack_inputs + @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTransfoXLSequenceClassifierOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + mems: list[tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> tuple | TFTransfoXLSequenceClassifierOutputWithPast: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., + config.vocab_size - 1]`. + """ + transformer_outputs = self.transformer( + input_ids=input_ids, + mems=mems, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + in_logits = None + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = ( + tf.argmax(tf.cast(tf.math.equal(input_ids, self.config.pad_token_id), input_ids.dtype), axis=-1) + - 1 + ) + sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1) + in_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1) + else: + sequence_lengths = -1 + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + loss = None + + if labels is not None: + if input_ids is not None: + batch_size, sequence_length = shape_list(input_ids)[:2] + else: + batch_size, sequence_length = shape_list(inputs_embeds)[:2] + assert self.config.pad_token_id is not None or batch_size == 1, ( + "Cannot handle batch sizes > 1 if no padding token is defined." + ) + + if not tf.is_tensor(sequence_lengths): + in_logits = logits[0:batch_size, sequence_lengths] + + loss = self.hf_compute_loss(tf.reshape(labels, [-1, 1]), tf.reshape(in_logits, [-1, self.num_labels])) + + pooled_logits = in_logits if in_logits is not None else logits + + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFTransfoXLSequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = [ + "TFAdaptiveEmbedding", + "TFTransfoXLForSequenceClassification", + "TFTransfoXLLMHeadModel", + "TFTransfoXLMainLayer", + "TFTransfoXLModel", + "TFTransfoXLPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py new file mode 100644 index 0000000000000000000000000000000000000000..48205e06fb20a473959544db4971dff0d3e58cbf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py @@ -0,0 +1,178 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A TF 2.0 Adaptive Softmax for Transformer XL model. +""" + +import tensorflow as tf + +from ....modeling_tf_utils import keras +from ....tf_utils import shape_list + + +class TFAdaptiveSoftmaxMask(keras.layers.Layer): + def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.d_embed = d_embed + self.d_proj = d_proj + + self.cutoffs = cutoffs + [vocab_size] + self.cutoff_ends = [0] + self.cutoffs + self.div_val = div_val + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + self.keep_order = keep_order + + self.out_layers = [] + self.out_projs = [] + + def build(self, input_shape): + if self.n_clusters > 0: + self.cluster_weight = self.add_weight( + shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight" + ) + self.cluster_bias = self.add_weight( + shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias" + ) + + if self.div_val == 1: + for i in range(len(self.cutoffs)): + if self.d_proj != self.d_embed: + weight = self.add_weight( + shape=(self.d_embed, self.d_proj), + initializer="zeros", + trainable=True, + name=f"out_projs_._{i}", + ) + self.out_projs.append(weight) + else: + self.out_projs.append(None) + weight = self.add_weight( + shape=(self.vocab_size, self.d_embed), + initializer="zeros", + trainable=True, + name=f"out_layers_._{i}_._weight", + ) + bias = self.add_weight( + shape=(self.vocab_size,), + initializer="zeros", + trainable=True, + name=f"out_layers_._{i}_._bias", + ) + self.out_layers.append((weight, bias)) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + d_emb_i = self.d_embed // (self.div_val**i) + + weight = self.add_weight( + shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name=f"out_projs_._{i}" + ) + self.out_projs.append(weight) + weight = self.add_weight( + shape=(r_idx - l_idx, d_emb_i), + initializer="zeros", + trainable=True, + name=f"out_layers_._{i}_._weight", + ) + bias = self.add_weight( + shape=(r_idx - l_idx,), + initializer="zeros", + trainable=True, + name=f"out_layers_._{i}_._bias", + ) + self.out_layers.append((weight, bias)) + super().build(input_shape) + + @staticmethod + def _logit(x, W, b, proj=None): + y = x + if proj is not None: + y = tf.einsum("ibd,ed->ibe", y, proj) + return tf.einsum("ibd,nd->ibn", y, W) + b + + @staticmethod + def _gather_logprob(logprob, target): + lp_size = shape_list(logprob) + r = tf.range(lp_size[0], dtype=target.dtype) + idx = tf.stack([r, target], 1) + return tf.gather_nd(logprob, idx) + + def call(self, hidden, target, return_mean=True, training=False): + head_logprob = 0 + if self.n_clusters == 0: + output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0]) + if target is not None: + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output) + out = tf.nn.log_softmax(output, axis=-1) + else: + hidden_sizes = shape_list(hidden) + out = [] + loss = tf.zeros(hidden_sizes[:2]) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + if target is not None: + mask = (target >= l_idx) & (target < r_idx) + mask_idx = tf.where(mask) + cur_target = tf.boolean_mask(target, mask) - l_idx + + if self.div_val == 1: + cur_W = self.out_layers[0][0][l_idx:r_idx] + cur_b = self.out_layers[0][1][l_idx:r_idx] + else: + cur_W = self.out_layers[i][0] + cur_b = self.out_layers[i][1] + + if i == 0: + cur_W = tf.concat([cur_W, self.cluster_weight], 0) + cur_b = tf.concat([cur_b, self.cluster_bias], 0) + + head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0]) + head_logprob = tf.nn.log_softmax(head_logit) + out.append(head_logprob[..., : self.cutoffs[0]]) + if target is not None: + cur_head_logprob = tf.boolean_mask(head_logprob, mask) + cur_logprob = self._gather_logprob(cur_head_logprob, cur_target) + else: + tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i]) + tail_logprob = tf.nn.log_softmax(tail_logit) + cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster + logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob + out.append(logprob_i) + if target is not None: + cur_head_logprob = tf.boolean_mask(head_logprob, mask) + cur_tail_logprob = tf.boolean_mask(tail_logprob, mask) + cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target) + cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1] + if target is not None: + loss += tf.scatter_nd(mask_idx, -cur_logprob, shape_list(loss)) + out = tf.concat(out, axis=-1) + + if target is not None: + if return_mean: + loss = tf.reduce_mean(loss) + # Add the training-time loss value to the layer using `self.add_loss()`. + self.add_loss(loss) + + # Log the loss as a metric (we could log arbitrary metrics, + # including different metrics for training and inference. + self.add_metric(loss, name=self.name, aggregation="mean" if return_mean else "") + + return out diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py new file mode 100644 index 0000000000000000000000000000000000000000..49d07391320d0fbc93c969cc72eb27b3e5e4ed0a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py @@ -0,0 +1,1303 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +PyTorch Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. In particular +https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py +""" + +import warnings +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ....modeling_utils import PreTrainedModel +from ....utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_transfo_xl import TransfoXLConfig +from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "transfo-xl/transfo-xl-wt103" +_CONFIG_FOR_DOC = "TransfoXLConfig" + + +def build_tf_to_pytorch_map(model, config): + """ + A map of modules from TF to PyTorch. This time I use a map to keep the PyTorch model as identical to the original + PyTorch model as possible. + """ + tf_to_pt_map = {} + + if hasattr(model, "transformer"): + # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax + tf_to_pt_map.update( + { + "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight, + "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias, + } + ) + for i, (out_l, proj_l, tie_proj) in enumerate( + zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs) + ): + layer_str = f"transformer/adaptive_softmax/cutoff_{i}/" + if config.tie_word_embeddings: + tf_to_pt_map.update({layer_str + "b": out_l.bias}) + else: + raise NotImplementedError + # I don't think this is implemented in the TF code + tf_to_pt_map.update({layer_str + "lookup_table": out_l.weight, layer_str + "b": out_l.bias}) + if not tie_proj: + tf_to_pt_map.update({layer_str + "proj": proj_l}) + # Now load the rest of the transformer + model = model.transformer + + # Embeddings + for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)): + layer_str = f"transformer/adaptive_embed/cutoff_{i}/" + tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l}) + + # Transformer blocks + for i, b in enumerate(model.layers): + layer_str = f"transformer/layer_{i}/" + tf_to_pt_map.update( + { + layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight, + layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias, + layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight, + layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight, + layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight, + layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight, + layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias, + layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight, + layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias, + layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight, + layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias, + } + ) + + # Relative positioning biases + if config.untie_r: + r_r_list = [] + r_w_list = [] + for b in model.layers: + r_r_list.append(b.dec_attn.r_r_bias) + r_w_list.append(b.dec_attn.r_w_bias) + else: + r_r_list = [model.r_r_bias] + r_w_list = [model.r_w_bias] + tf_to_pt_map.update({"transformer/r_r_bias": r_r_list, "transformer/r_w_bias": r_w_list}) + return tf_to_pt_map + + +def load_tf_weights_in_transfo_xl(model, config, tf_path): + """Load tf checkpoints in a pytorch model""" + try: + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + # Build TF to PyTorch weights loading map + tf_to_pt_map = build_tf_to_pytorch_map(model, config) + + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + tf_weights = {} + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + tf_weights[name] = array + + for name, pointer in tf_to_pt_map.items(): + assert name in tf_weights + array = tf_weights[name] + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if "kernel" in name or "proj" in name: + array = np.transpose(array) + if ("r_r_bias" in name or "r_w_bias" in name) and len(pointer) > 1: + # Here we will split the TF weights + assert len(pointer) == array.shape[0] + for i, p_i in enumerate(pointer): + arr_i = array[i, ...] + try: + assert p_i.shape == arr_i.shape + except AssertionError as e: + e.args += (p_i.shape, arr_i.shape) + raise + logger.info(f"Initialize PyTorch weight {name} for layer {i}") + p_i.data = torch.from_numpy(arr_i) + else: + try: + assert pointer.shape == array.shape, ( + f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" + ) + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + tf_weights.pop(name, None) + tf_weights.pop(name + "/Adam", None) + tf_weights.pop(name + "/Adam_1", None) + + logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}") + return model + + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super().__init__() + + self.demb = demb + + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer("inv_freq", inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.outer(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[:, None, :].expand(-1, bsz, -1) + else: + return pos_emb[:, None, :] + + +class PositionwiseFF(nn.Module): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5): + super().__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Linear(d_model, d_inner), + nn.ReLU(inplace=True), + nn.Dropout(dropout), + nn.Linear(d_inner, d_model), + nn.Dropout(dropout), + ) + + self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon) + + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + # layer normalization + positionwise feed-forward + core_out = self.CoreNet(self.layer_norm(inp)) + + # residual connection + output = core_out + inp + else: + # positionwise feed-forward + core_out = self.CoreNet(inp) + + # residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + + +class RelPartialLearnableMultiHeadAttn(nn.Module): + def __init__( + self, + n_head, + d_model, + d_head, + dropout, + dropatt=0, + pre_lnorm=False, + r_r_bias=None, + r_w_bias=None, + layer_norm_epsilon=1e-5, + ): + super().__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon) + + self.scale = 1 / (d_head**0.5) + + self.pre_lnorm = pre_lnorm + + if r_r_bias is None or r_w_bias is None: # Biases are not shared + self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + else: + self.r_r_bias = r_r_bias + self.r_w_bias = r_w_bias + + self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) + + def _rel_shift(self, x): + zero_pad_shape = (x.size(0), 1) + x.size()[2:] + zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=1) + + x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:] + x_padded = x_padded.view(*x_padded_shape) + + x = x_padded[1:].view_as(x) + + return x + + def forward(self, w, r, attn_mask=None, mems=None, head_mask=None, output_attentions=False): + qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + + r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head + + # compute attention score + rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head + AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + + rr_head_q = w_head_q + self.r_r_bias + BD = torch.einsum("ibnd,jnd->ijbn", (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head + BD = self._rel_shift(BD) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + mask_value = torch.finfo(attn_score.dtype).min + + # compute attention probability + if attn_mask is not None and torch.sum(attn_mask).item(): + attn_mask = attn_mask == 1 # Switch to bool + if attn_mask.dim() == 2: + attn_score = ( + attn_score.float().masked_fill(attn_mask[None, :, :, None], mask_value).type_as(attn_score) + ) + elif attn_mask.dim() == 3: + attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], mask_value).type_as(attn_score) + + # [qlen x klen x bsz x n_head] + attn_prob = nn.functional.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + # Mask heads if we want to + if head_mask is not None: + attn_prob = attn_prob * head_mask + + # compute attention vector + attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + # linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + # residual connection + outputs = [w + attn_out] + else: + # residual connection + layer normalization + outputs = [self.layer_norm(w + attn_out)] + + if output_attentions: + outputs.append(attn_prob) + + return outputs + + +class RelPartialLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs): + super().__init__() + + self.dec_attn = RelPartialLearnableMultiHeadAttn( + n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs + ) + self.pos_ff = PositionwiseFF( + d_model, d_inner, dropout, pre_lnorm=kwargs.get("pre_lnorm"), layer_norm_epsilon=layer_norm_epsilon + ) + + def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None, output_attentions=False): + attn_outputs = self.dec_attn( + dec_inp, + r, + attn_mask=dec_attn_mask, + mems=mems, + head_mask=head_mask, + output_attentions=output_attentions, + ) + ff_output = self.pos_ff(attn_outputs[0]) + + outputs = [ff_output] + attn_outputs[1:] + + return outputs + + +class AdaptiveEmbedding(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False): + super().__init__() + + self.n_token = n_token + self.d_embed = d_embed + + self.cutoffs = cutoffs + [n_token] + self.div_val = div_val + self.d_proj = d_proj + + self.emb_scale = d_proj**0.5 + + self.cutoff_ends = [0] + self.cutoffs + + self.emb_layers = nn.ModuleList() + self.emb_projs = nn.ParameterList() + if div_val == 1: + self.emb_layers.append(nn.Embedding(n_token, d_embed, sparse=sample_softmax > 0)) + if d_proj != d_embed: + self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed))) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + d_emb_i = d_embed // (div_val**i) + self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i)) + self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) + + def forward(self, inp): + if self.div_val == 1: + embed = self.emb_layers[0](inp) + if self.d_proj != self.d_embed: + embed = nn.functional.linear(embed, self.emb_projs[0]) + else: + param = next(self.parameters()) + inp_flat = inp.view(-1) + emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + + mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + inp_i = inp_flat.index_select(0, indices_i) - l_idx + emb_i = self.emb_layers[i](inp_i) + emb_i = nn.functional.linear(emb_i, self.emb_projs[i]) + + emb_flat.index_copy_(0, indices_i, emb_i) + + embed_shape = inp.size() + (self.d_proj,) + embed = emb_flat.view(embed_shape) + + embed.mul_(self.emb_scale) + + return embed + + +class TransfoXLPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: TransfoXLConfig + load_tf_weights = load_tf_weights_in_transfo_xl + base_model_prefix = "transformer" + + def _init_weight(self, weight): + if self.config.init == "uniform": + nn.init.uniform_(weight, -self.config.init_range, self.config.init_range) + elif self.config.init == "normal": + nn.init.normal_(weight, 0.0, self.config.init_std) + + def _init_bias(self, bias): + nn.init.constant_(bias, 0.0) + + def _init_weights(self, m): + """Initialize the weights.""" + classname = m.__class__.__name__ + if classname.find("Linear") != -1: + if hasattr(m, "weight") and m.weight is not None: + self._init_weight(m.weight) + if hasattr(m, "bias") and m.bias is not None: + self._init_bias(m.bias) + elif classname.find("AdaptiveEmbedding") != -1: + if hasattr(m, "emb_projs"): + for i in range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std) + elif classname.find("Embedding") != -1: + if hasattr(m, "weight"): + self._init_weight(m.weight) + elif classname.find("ProjectedAdaptiveLogSoftmax") != -1: + if hasattr(m, "cluster_weight") and m.cluster_weight is not None: + self._init_weight(m.cluster_weight) + if hasattr(m, "cluster_bias") and m.cluster_bias is not None: + self._init_bias(m.cluster_bias) + if hasattr(m, "out_projs"): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std) + elif classname.find("LayerNorm") != -1: + if hasattr(m, "weight"): + nn.init.normal_(m.weight, 1.0, self.config.init_std) + if hasattr(m, "bias") and m.bias is not None: + self._init_bias(m.bias) + else: + if hasattr(m, "r_emb"): + self._init_weight(m.r_emb) + if hasattr(m, "r_w_bias"): + self._init_weight(m.r_w_bias) + if hasattr(m, "r_r_bias"): + self._init_weight(m.r_r_bias) + if hasattr(m, "r_bias"): + self._init_bias(m.r_bias) + + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, layer: Optional[int] = -1): + """ + Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. Take care of tying + weights embeddings afterwards if the model class has a *tie_weights()* method. + + Arguments: + new_num_tokens: (*optional*) int: + New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at + the end. Reducing the size will remove vectors from the end. If not provided or None: does nothing and + just returns a pointer to the input tokens `torch.nn.Embeddings` Module of the model. + layer: (*optional*) int: + Layer of the *AdaptiveEmbedding* where the resizing should be done. Per default the last layer will be + resized. Be aware that when resizing other than the last layer, you have to ensure that the new + token(s) in the tokenizer are at the corresponding position. + + Return: `torch.nn.Embeddings` Pointer to the input tokens Embeddings Module of the model + """ + base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed + + if new_num_tokens is None: + return self.get_input_embeddings() + + new_num_tokens_layer, layer = self._get_new_num_tokens_layer(new_num_tokens, layer) + assert new_num_tokens_layer > 0, "The size of the new embedding layer cannot be 0 or less" + model_embeds = base_model._resize_token_embeddings(new_num_tokens_layer, layer) + + # Update base model and current model config + self.config.vocab_size = new_num_tokens + base_model.vocab_size = new_num_tokens + base_model.n_token = new_num_tokens + + new_embedding_shapes = self._get_embedding_shapes() + self._resize_cutoffs(new_num_tokens, new_num_tokens_layer, new_embedding_shapes, layer) + + # Tie weights again if needed + self.tie_weights() + + return model_embeds + + def _get_new_num_tokens_layer(self, new_num_tokens, layer): + embeddings = self.get_input_embeddings() + if layer == -1: + layer = len(embeddings.emb_layers) - 1 + assert 0 <= layer <= len(embeddings.emb_layers) - 1 + + new_num_tokens_layer = ( + new_num_tokens + - sum(emb.weight.shape[0] for emb in embeddings.emb_layers[:layer]) + - sum(emb.weight.shape[0] for emb in embeddings.emb_layers[layer + 1 :]) + ) + return new_num_tokens_layer, layer + + def _get_embedding_shapes(self): + embeddings = self.get_input_embeddings() + return [emb.weight.shape[0] for emb in embeddings.emb_layers] + + def _resize_token_embeddings(self, new_num_tokens, layer=-1): + embeddings = self.get_input_embeddings() + if new_num_tokens is None: + return embeddings + new_embeddings_layer = self._get_resized_embeddings(embeddings.emb_layers[layer], new_num_tokens) + embeddings.emb_layers[layer] = new_embeddings_layer + + self.set_input_embeddings(embeddings) + + return self.get_input_embeddings() + + def _resize_cutoffs(self, new_num_tokens, new_emb_size, new_embedding_shapes, layer): + embeddings = self.get_input_embeddings() + + for i in range(layer, len(embeddings.cutoffs)): + embeddings.cutoffs[i] = sum(new_embedding_shapes[: i + 1]) + + embeddings.cutoff_ends = [0] + embeddings.cutoffs + embeddings.n_token = new_num_tokens + + self.config.cutoffs = embeddings.cutoffs[:-1] + + return embeddings.cutoffs + + +@dataclass +class TransfoXLModelOutput(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + mems (`list[torch.FloatTensor]` of length `config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` + input) to speed up sequential decoding. The token ids which have their past given to this model should not + be passed as input ids as they have already been computed. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: torch.FloatTensor + mems: Optional[list[torch.FloatTensor]] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +class TransfoXLSequenceClassifierOutputWithPast(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + mems (`list[torch.FloatTensor]` of length `config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` + input) to speed up sequential decoding. The token ids which have their past given to this model should not + be passed as input ids as they have already been computed. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + mems: Optional[list[torch.FloatTensor]] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +class TransfoXLLMHeadModelOutput(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + losses (`torch.FloatTensor` of shape *(batch_size, sequence_length-1)*, *optional*, returned when `labels` is provided): + Language modeling losses (not reduced). + prediction_scores (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax). + mems (`list[torch.FloatTensor]` of length `config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems` + input) to speed up sequential decoding. The token ids which have their past given to this model should not + be passed as input ids as they have already been computed. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + loss (`torch.FloatTensor` of shape `()`, *optional*, returned when `labels` is provided) + Reduced language modeling loss. + """ + + losses: Optional[torch.FloatTensor] = None + prediction_scores: Optional[torch.FloatTensor] = None + mems: Optional[list[torch.FloatTensor]] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + loss: Optional[torch.FloatTensor] = None + + @property + def logits(self): + # prediction scores are the output of the adaptive softmax, see + # the file `modeling_transfo_xl_utilities`. Since the adaptive + # softmax returns the log softmax value, `self.prediction_scores` + # are strictly speaking not exactly `logits`, but behave the same + # way logits do. + return self.prediction_scores + + +TRANSFO_XL_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`TransfoXLConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +TRANSFO_XL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + mems (`list[torch.FloatTensor]` of length `config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see + `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems + given to this model should not be passed as `input_ids` as they have already been computed. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + TRANSFO_XL_START_DOCSTRING, +) +class TransfoXLModel(TransfoXLPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.n_token = config.vocab_size + + self.d_embed = config.d_embed + self.d_model = config.d_model + self.n_head = config.n_head + self.d_head = config.d_head + + self.word_emb = AdaptiveEmbedding( + config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val + ) + + self.drop = nn.Dropout(config.dropout) + + self.n_layer = config.n_layer + self.mem_len = config.mem_len + self.attn_type = config.attn_type + + if not config.untie_r: + self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + + self.layers = nn.ModuleList() + if config.attn_type == 0: # the default attention + for i in range(config.n_layer): + self.layers.append( + RelPartialLearnableDecoderLayer( + config.n_head, + config.d_model, + config.d_head, + config.d_inner, + config.dropout, + dropatt=config.dropatt, + pre_lnorm=config.pre_lnorm, + r_w_bias=None if config.untie_r else self.r_w_bias, + r_r_bias=None if config.untie_r else self.r_r_bias, + layer_norm_epsilon=config.layer_norm_epsilon, + ) + ) + else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints + raise NotImplementedError # Removed them to avoid maintaining dead code + + self.same_length = config.same_length + self.clamp_len = config.clamp_len + + if self.attn_type == 0: # default attention + self.pos_emb = PositionalEmbedding(self.d_model) + else: # learnable embeddings and absolute embeddings + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.word_emb + + def set_input_embeddings(self, new_embeddings): + self.word_emb = new_embeddings + + def backward_compatible(self): + self.sample_softmax = -1 + + def reset_memory_length(self, mem_len): + self.mem_len = mem_len + + def _prune_heads(self, heads): + logger.info("Head pruning is not implemented for Transformer-XL model") + pass + + def init_mems(self, bsz): + if self.mem_len > 0: + mems = [] + param = next(self.parameters()) + for i in range(self.n_layer): + empty = torch.zeros(self.mem_len, bsz, self.config.d_model, dtype=param.dtype, device=param.device) + mems.append(empty) + + return mems + else: + return None + + def _update_mems(self, hids, mems, mlen, qlen): + # does not deal with None + if mems is None: + return None + + # mems is not None + assert len(hids) == len(mems), "len(hids) != len(mems)" + + # There are `mlen + qlen` steps that can be cached into mems + with torch.no_grad(): + new_mems = [] + end_idx = mlen + max(0, qlen) + beg_idx = max(0, end_idx - self.mem_len) + for i in range(len(hids)): + cat = torch.cat([mems[i], hids[i]], dim=0) + new_mems.append(cat[beg_idx:end_idx].detach()) + + return new_mems + + @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TransfoXLModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + mems: Optional[list[torch.FloatTensor]] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TransfoXLModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library + # so we transpose here from shape [bsz, len] to shape [len, bsz] + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_ids = input_ids.transpose(0, 1).contiguous() + qlen, bsz = input_ids.size() + elif inputs_embeds is not None: + inputs_embeds = inputs_embeds.transpose(0, 1).contiguous() + qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if mems is None: + mems = self.init_mems(bsz) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) + # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0) + head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to float if need + fp16 compatibility + else: + head_mask = [None] * self.n_layer + + if inputs_embeds is not None: + word_emb = inputs_embeds + else: + word_emb = self.word_emb(input_ids) + + mlen = mems[0].size(0) if mems is not None else 0 + klen = mlen + qlen + if self.same_length: + all_ones = word_emb.new_ones((qlen, klen), dtype=torch.bool) + mask_len = klen - self.mem_len + if mask_len > 0: + mask_shift_len = qlen - mask_len + else: + mask_shift_len = qlen + dec_attn_mask = (torch.triu(all_ones, 1 + mlen) + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 + else: + dec_attn_mask = torch.triu(word_emb.new_ones((qlen, klen), dtype=torch.bool), diagonal=1 + mlen)[ + :, :, None + ] + + hids = [] + attentions = [] if output_attentions else None + if self.attn_type == 0: # default + pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, dtype=torch.int64).type_as( + dtype=word_emb.dtype + ) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb) + pos_emb = self.drop(pos_emb) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + mems_i = None if mems is None else mems[i] + layer_outputs = layer( + core_out, + pos_emb, + dec_attn_mask=dec_attn_mask, + mems=mems_i, + head_mask=head_mask[i], + output_attentions=output_attentions, + ) + core_out = layer_outputs[0] + if output_attentions: + attentions.append(layer_outputs[1]) + else: # learnable embeddings and absolute embeddings + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + + core_out = self.drop(core_out) + + new_mems = self._update_mems(hids, mems, mlen, qlen) + + if output_hidden_states: + # Add last layer and transpose to library standard shape [bsz, len, hidden_dim] + hids.append(core_out) + hids = tuple(t.transpose(0, 1).contiguous() for t in hids) + else: + hids = None + if output_attentions: + # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len] + attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions) + # We transpose back here to shape [bsz, len, hidden_dim] + core_out = core_out.transpose(0, 1).contiguous() + + if not return_dict: + return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None) + + return TransfoXLModelOutput( + last_hidden_state=core_out, + mems=new_mems, + hidden_states=hids, + attentions=attentions, + ) + + +@add_start_docstrings( + """ + The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive + input embeddings) + """, + TRANSFO_XL_START_DOCSTRING, +) +class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): + _tied_weights_keys = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"] + + def __init__(self, config): + super().__init__(config) + self.transformer = TransfoXLModel(config) + self.sample_softmax = config.sample_softmax + self.trainer_compatible = getattr(config, "trainer_compatible", False) + + if not self.trainer_compatible: + warnings.warn( + "The output of TransfoXL will be updated in v5 to support a single loss as first argument. In order " + "to use that updated output, please specify `trainer_compatible=True` as your configuration" + " attribute.", + DeprecationWarning, + ) + + assert self.sample_softmax <= 0, ( + "Sampling from the softmax is not implemented yet. Please look at issue: #3310:" + " https://github.com/huggingface/transformers/issues/3310" + ) + + self.crit = ProjectedAdaptiveLogSoftmax( + config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val + ) + + # Initialize weights and apply final processing + self.post_init() + + def tie_weights(self): + """ + Run this to be sure output and input (adaptive) softmax weights are tied + """ + + if self.config.tie_word_embeddings: + for i in range(len(self.crit.out_layers)): + self._tie_or_clone_weights(self.crit.out_layers[i], self.transformer.word_emb.emb_layers[i]) + if self.config.tie_projs: + for i, tie_proj in enumerate(self.config.tie_projs): + if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed: + if self.config.torchscript: + self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone()) + else: + self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0] + elif tie_proj and self.config.div_val != 1: + if self.config.torchscript: + self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone()) + else: + self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i] + + def reset_memory_length(self, mem_len): + self.transformer.reset_memory_length(mem_len) + + def init_mems(self, bsz): + return self.transformer.init_mems(bsz) + + @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TransfoXLLMHeadModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + mems: Optional[list[torch.FloatTensor]] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TransfoXLLMHeadModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if input_ids is not None: + bsz, tgt_len = input_ids.size(0), input_ids.size(1) + elif inputs_embeds is not None: + bsz, tgt_len = inputs_embeds.size(0), inputs_embeds.size(1) + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + transformer_outputs = self.transformer( + input_ids, + mems=mems, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden = transformer_outputs[0] + pred_hid = last_hidden[:, -tgt_len:] + + if labels is not None: + # Prevents all labels being -100 and throwing an error + # when backwarding the loss + miss_valid_label = labels[0, 1:].sum() == (labels.size(1) - 1) * -100 + if miss_valid_label: + # Sets an token, just to prevent loss from being NaN + labels[0, 1] = self.config.eos_token_id + + softmax_output = self.crit(pred_hid, labels) + prediction_scores = softmax_output.view(bsz, tgt_len, -1) if labels is None else () + + if labels is not None: + losses = softmax_output.view(bsz, tgt_len - 1) + # Avoids from incorporating padding (-100) tokens into loss value + loss = losses[losses != 0].mean() + else: + losses, loss = None, None + + if not return_dict: + if self.trainer_compatible: + output = (prediction_scores, losses) if losses is not None else (prediction_scores,) + output += transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + else: + output = (prediction_scores, *transformer_outputs[1:]) + output = ((losses,) + output) if losses is not None else output + return (output + (loss,)) if loss is not None else output + + return TransfoXLLMHeadModelOutput( + loss=loss, + prediction_scores=prediction_scores, + losses=losses, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def get_output_embeddings(self): + """Double-check if you are using adaptive softmax.""" + if self.sample_softmax > 0: + return self.out_layer + else: + return self.crit.out_layers[-1] + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **model_kwargs): + inputs = {} + + # if past is defined in model kwargs then use it for faster decoding + if past_key_values: + inputs["mems"] = past_key_values + inputs["input_ids"] = input_ids[:, -1].unsqueeze(-1) + else: + inputs["input_ids"] = input_ids + + return inputs + + def _resize_cutoffs(self, new_num_tokens, new_emb_size, new_embedding_shapes, layer): + new_cutoffs = super()._resize_cutoffs(new_num_tokens, new_emb_size, new_embedding_shapes, layer) + + self.crit.cutoffs = new_cutoffs + self.crit.cutoff_ends = [0] + new_cutoffs + self.crit.n_token = new_num_tokens + + @staticmethod + def _reorder_cache(mems: list[torch.Tensor], beam_idx: torch.Tensor) -> list[torch.Tensor]: + """ + This function is used to re-order the `mems` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `mems` with the correct beam_idx at every + generation step. + """ + return [layer_past.index_select(1, beam_idx.to(layer_past.device)) for layer_past in mems] + + +@add_start_docstrings( + """ + The Transformer-XL Model transformer with a sequence classification head on top (linear layer). + + [`TransfoXLForSequenceClassification`] uses the last token in order to do the classification, as other causal + models (e.g. GPT-1) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + TRANSFO_XL_START_DOCSTRING, +) +class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = TransfoXLModel(config) + self.score = nn.Linear(config.d_embed, self.num_labels, bias=False) + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TransfoXLSequenceClassifierOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + mems: Optional[list[torch.FloatTensor]] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TransfoXLSequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + mems=mems, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size, sequence_length = input_ids.shape[:2] + else: + batch_size, sequence_length = inputs_embeds.shape[:2] + + assert self.config.pad_token_id is not None or batch_size == 1, ( + "Cannot handle batch sizes > 1 if no padding token is defined." + ) + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[range(batch_size), sequence_lengths] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TransfoXLSequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = [ + "AdaptiveEmbedding", + "TransfoXLForSequenceClassification", + "TransfoXLLMHeadModel", + "TransfoXLModel", + "TransfoXLPreTrainedModel", + "load_tf_weights_in_transfo_xl", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py new file mode 100644 index 0000000000000000000000000000000000000000..f76f3ccc6259fcb033b44eb43dd98be23482221c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py @@ -0,0 +1,251 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities for PyTorch Transformer XL model. Directly adapted from https://github.com/kimiyoung/transformer-xl. +""" + +import torch +from torch import nn + + +# CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) +# CUDA_MINOR = int(torch.version.cuda.split('.')[1]) + + +class ProjectedAdaptiveLogSoftmax(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False): + super().__init__() + + self.n_token = n_token + self.d_embed = d_embed + self.d_proj = d_proj + + self.cutoffs = cutoffs + [n_token] + self.cutoff_ends = [0] + self.cutoffs + self.div_val = div_val + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + if self.n_clusters > 0: + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.out_layers = nn.ModuleList() + self.out_projs = nn.ParameterList() + + if div_val == 1: + for i in range(len(self.cutoffs)): + if d_proj != d_embed: + self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed))) + else: + self.out_projs.append(None) + + self.out_layers.append(nn.Linear(d_embed, n_token)) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + d_emb_i = d_embed // (div_val**i) + + self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) + + self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx)) + + self.keep_order = keep_order + + def _compute_logit(self, hidden, weight, bias, proj): + if proj is None: + logit = nn.functional.linear(hidden, weight, bias=bias) + else: + # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1: + proj_hid = nn.functional.linear(hidden, proj.t().contiguous()) + logit = nn.functional.linear(proj_hid, weight, bias=bias) + # else: + # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) + # if bias is not None: + # logit = logit + bias + + return logit + + def forward(self, hidden, labels=None, keep_order=False): + """ + Params: + hidden :: [len*bsz x d_proj] + labels :: [len*bsz] + + Return: + if labels is None: out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary else: out :: + [(len-1)*bsz] Negative log likelihood. We could replace this implementation by the native PyTorch one if + theirs had an option to set bias on all clusters in the native one. here: + https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138 + """ + + if labels is not None: + # Shift so that tokens < n predict n + hidden = hidden[..., :-1, :].contiguous() + labels = labels[..., 1:].contiguous() + hidden = hidden.view(-1, hidden.size(-1)) + labels = labels.view(-1) + if hidden.size(0) != labels.size(0): + raise RuntimeError("Input and labels should have the same size in the batch dimension.") + else: + hidden = hidden.view(-1, hidden.size(-1)) + + if self.n_clusters == 0: + logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) + if labels is not None: + mask = labels != -100 + out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device) + out[mask] = ( + -nn.functional.log_softmax(logit, dim=-1)[mask].gather(1, labels[mask].unsqueeze(1)).squeeze(1) + ) + else: + out = nn.functional.log_softmax(logit, dim=-1) + else: + # construct weights and biases + weights, biases = [], [] + for i in range(len(self.cutoffs)): + if self.div_val == 1: + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + weight_i = self.out_layers[0].weight[l_idx:r_idx] + bias_i = self.out_layers[0].bias[l_idx:r_idx] + else: + weight_i = self.out_layers[i].weight + bias_i = self.out_layers[i].bias + + if i == 0: + weight_i = torch.cat([weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat([bias_i, self.cluster_bias], dim=0) + + weights.append(weight_i) + biases.append(bias_i) + + head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] + + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + head_logprob = nn.functional.log_softmax(head_logit, dim=1) + + if labels is None: + out = hidden.new_empty((head_logit.size(0), self.n_token)) + else: + out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] + + if labels is not None: + mask_i = (labels >= l_idx) & (labels < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = labels.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + hidden_i = hidden.index_select(0, indices_i) + else: + hidden_i = hidden + + if i == 0: + if labels is not None: + logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1) + else: + out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]] + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) + tail_logprob_i = nn.functional.log_softmax(tail_logit_i, dim=1) + cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster + if labels is not None: + logprob_i = head_logprob_i[:, cluster_prob_idx] + tail_logprob_i.gather( + 1, target_i[:, None] + ).squeeze(1) + else: + logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i + out[:, l_idx:r_idx] = logprob_i + + if labels is not None: + if (hasattr(self, "keep_order") and self.keep_order) or keep_order: + out.index_copy_(0, indices_i, -logprob_i) + else: + out[offset : offset + logprob_i.size(0)].copy_(-logprob_i) + offset += logprob_i.size(0) + + return out + + def log_prob(self, hidden): + r""" + Computes log probabilities for all \\(n\_classes\\) From: + https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.p + + Args: + hidden (Tensor): a minibatch of example + + Returns: + log-probabilities of for each class \\(c\\) in range \\(0 <= c <= n\_classes\\), where \\(n\_classes\\) is + a parameter passed to `AdaptiveLogSoftmaxWithLoss` constructor. Shape: + + - Input: \\((N, in\_features)\\) + - Output: \\((N, n\_classes)\\) + """ + if self.n_clusters == 0: + logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) + return nn.functional.log_softmax(logit, dim=-1) + else: + # construct weights and biases + weights, biases = [], [] + for i in range(len(self.cutoffs)): + if self.div_val == 1: + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + weight_i = self.out_layers[0].weight[l_idx:r_idx] + bias_i = self.out_layers[0].bias[l_idx:r_idx] + else: + weight_i = self.out_layers[i].weight + bias_i = self.out_layers[i].bias + + if i == 0: + weight_i = torch.cat([weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat([bias_i, self.cluster_bias], dim=0) + + weights.append(weight_i) + biases.append(bias_i) + + head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + + out = hidden.new_empty((head_logit.size(0), self.n_token)) + head_logprob = nn.functional.log_softmax(head_logit, dim=1) + + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1] + + if i == 0: + out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]] + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i) + tail_logprob_i = nn.functional.log_softmax(tail_logit_i, dim=1) + + logprob_i = head_logprob[:, -i] + tail_logprob_i + out[:, start_idx, stop_idx] = logprob_i + + return out diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py new file mode 100644 index 0000000000000000000000000000000000000000..70e2da0185563d4115748266474a935ffddef059 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py @@ -0,0 +1,825 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tokenization classes for Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. +""" + +import glob +import os +import pickle +import re +from collections import Counter, OrderedDict +from typing import Optional + +import numpy as np + +from ....tokenization_utils import PreTrainedTokenizer +from ....utils import ( + cached_file, + check_torch_load_is_safe, + is_sacremoses_available, + is_torch_available, + logging, + requires_backends, + strtobool, + torch_only_method, +) + + +if is_sacremoses_available(): + import sacremoses as sm + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "pretrained_vocab_file": "vocab.pkl", + "pretrained_vocab_file_torch": "vocab.bin", + "vocab_file": "vocab.txt", +} + + +PRETRAINED_CORPUS_ARCHIVE_MAP = { + "transfo-xl/transfo-xl-wt103": "https://huggingface.co/transfo-xl/transfo-xl-wt103/resolve/main/corpus.bin", +} +CORPUS_NAME = "corpus.bin" + +MATCH_NUMBERS = r"(?<=\d)[,.](?=\d)", r" @\g<0>@ " +DETOKENIZE_NUMBERS = [(r" @\,@ ", r","), (r" @\.@ ", r".")] + + +def tokenize_numbers(text_array: list[str]) -> list[str]: + """ + Splits large comma-separated numbers and floating point values. This is done by replacing commas with ' @,@ ' and + dots with ' @.@ '. + + Args: + text_array: An already tokenized text as list. + + Returns: + A list of strings with tokenized numbers. + + Example: + + ```python + >>> tokenize_numbers(["$", "5,000", "1.73", "m"]) + ['$', '5', '@,@', '000', '1', '@.@', '73', 'm'] + ```""" + tokenized = [] + for i in range(len(text_array)): + reg, sub = MATCH_NUMBERS + replaced = re.sub(reg, sub, text_array[i]).split() + tokenized.extend(replaced) + + return tokenized + + +def detokenize_numbers(text: str) -> str: + """ + Inverts the operation of *tokenize_numbers*. This is replacing ' @,@ ' and ' @.@' by ',' and '.'. + + Args: + text: A string where the number should be detokenized. + + Returns: + A detokenized string. + + Example: + + ```python + >>> detokenize_numbers("$ 5 @,@ 000 1 @.@ 73 m") + '$ 5,000 1.73 m' + ```""" + for reg, sub in DETOKENIZE_NUMBERS: + text = re.sub(reg, sub, text) + return text + + +class TransfoXLTokenizer(PreTrainedTokenizer): + """ + Construct a Transformer-XL tokenizer adapted from Vocab class in [the original + code](https://github.com/kimiyoung/transformer-xl). The Transformer-XL tokenizer is a word-level tokenizer (no + sub-word tokenization). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + special (`list[str]`, *optional*): + A list of special tokens (to be treated by the original implementation of this tokenizer). + min_freq (`int`, *optional*, defaults to 0): + The minimum number of times a token has to be present in order to be kept in the vocabulary (otherwise it + will be mapped to `unk_token`). + max_size (`int`, *optional*): + The maximum size of the vocabulary. If left unset, it will default to the size of the vocabulary found + after excluding the tokens according to the `min_freq` rule. + lower_case (`bool`, *optional*, defaults to `False`): + Whether or not to lowercase the input when tokenizing. + delimiter (`str`, *optional*): + The delimiter used between tokens. + vocab_file (`str`, *optional*): + File containing the vocabulary (from the original implementation). + pretrained_vocab_file (`str`, *optional*): + File containing the vocabulary as saved with the `save_pretrained()` method. + never_split (`list[str]`, *optional*): + List of tokens that should never be split. If no list is specified, will simply use the existing special + tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + additional_special_tokens (`list[str]`, *optional*, defaults to `['']`): + A list of additional special tokens (for the HuggingFace functionality). + language (`str`, *optional*, defaults to `"en"`): + The language of this tokenizer (used for mose preprocessing). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids"] + + def __init__( + self, + special=None, + min_freq=0, + max_size=None, + lower_case=False, + delimiter=None, + vocab_file=None, + pretrained_vocab_file: Optional[str] = None, + never_split=None, + unk_token="", + eos_token="", + additional_special_tokens=[""], + language="en", + **kwargs, + ): + logger.error( + "`TransfoXL` was deprecated due to security issues linked to `pickle.load` in `TransfoXLTokenizer`. " + "See more details on this model's documentation page: " + "`https://github.com/huggingface/transformers/blob/main/docs/source/en/model_doc/transfo-xl.md`." + ) + + requires_backends(self, "sacremoses") + if special is None: + special = [] + self.counter = Counter() + self.special = special + self.min_freq = min_freq + self.max_size = max_size + self.lower_case = lower_case + self.delimiter = delimiter + self.vocab_file = vocab_file + self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~' + self.punction_without_space_before_pattern = re.compile(rf"[^\s][{self.punctuation_symbols}]") + self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern() + self.language = language + self.moses_punct_normalizer = sm.MosesPunctNormalizer(language) + self.moses_tokenizer = sm.MosesTokenizer(language) + self.moses_detokenizer = sm.MosesDetokenizer(language) + self.idx2sym = [] + self.sym2idx = OrderedDict() + # This try... catch... is not beautiful but honestly this tokenizer was not made to be used + # in a library like ours, at all. + try: + vocab_dict = None + if pretrained_vocab_file is not None: + # Priority on pickle files (support PyTorch and TF) + if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")): + raise ValueError( + "This part uses `pickle.load` which is insecure and will execute arbitrary code that is " + "potentially malicious. It's recommended to never unpickle data that could have come from an " + "untrusted source, or that could have been tampered with. If you already verified the pickle " + "data and decided to use it, you can set the environment variable " + "`TRUST_REMOTE_CODE` to `True` to allow it." + ) + with open(pretrained_vocab_file, "rb") as f: + vocab_dict = pickle.load(f) + + # Loading a torch-saved transfo-xl vocab dict with pickle results in an integer + # Entering this if statement means that we tried to load a torch-saved file with pickle, and we failed. + # We therefore load it with torch, if it's available. + if isinstance(vocab_dict, int): + if not is_torch_available(): + raise ImportError( + "Not trying to load dict with PyTorch as you need to install pytorch to load " + "from a PyTorch pretrained vocabulary, " + "or activate it with environment variables USE_TORCH=1 and USE_TF=0." + ) + check_torch_load_is_safe() + vocab_dict = torch.load(pretrained_vocab_file, weights_only=True) + + if vocab_dict is not None: + for key, value in vocab_dict.items(): + if key not in self.__dict__ or key in ["sym2idx", "idx2sym"]: + self.__dict__[key] = value + elif vocab_file is not None: + self.build_vocab() + + except Exception as e: + raise ValueError( + f"Unable to parse file {pretrained_vocab_file}. Unknown format. " + "If you tried to load a model saved through TransfoXLTokenizerFast, " + "please note they are not compatible." + ) from e + + if vocab_file is not None: + self.build_vocab() + + super().__init__( + special=special, + min_freq=min_freq, + max_size=max_size, + lower_case=lower_case, + delimiter=delimiter, + vocab_file=vocab_file, + pretrained_vocab_file=pretrained_vocab_file, + never_split=never_split, + unk_token=unk_token, + eos_token=eos_token, + additional_special_tokens=additional_special_tokens, + language=language, + **kwargs, + ) + + # these are not required to initialize the parent class as only used when tokenizing. + if never_split is None: + never_split = self.all_special_tokens + self.never_split = never_split + + @property + def do_lower_case(self): + return self.lower_case + + def _compile_space_around_punctuation_pattern(self): + look_ahead_for_special_token = f"(?=[{self.punctuation_symbols}])" + look_ahead_to_match_all_except_space = r"(?=[^\s])" + return re.compile(r"" + look_ahead_for_special_token + look_ahead_to_match_all_except_space) + + def count_file(self, path, verbose=False, add_eos=False): + if verbose: + logger.info(f"counting file {path} ...") + assert os.path.exists(path), f"Input file {path} not found" + + sents = [] + with open(path, "r", encoding="utf-8") as f: + for idx, line in enumerate(f): + if verbose and idx > 0 and idx % 500000 == 0: + logger.info(f" line {idx}") + symbols = self.tokenize(line, add_eos=add_eos) + self.counter.update(symbols) + sents.append(symbols) + + return sents + + def count_sents(self, sents, verbose=False): + """ + sents : a list of sentences, each a list of tokenized symbols + """ + if verbose: + logger.info(f"counting {len(sents)} sents ...") + for idx, symbols in enumerate(sents): + if verbose and idx > 0 and idx % 500000 == 0: + logger.info(f" line {idx}") + self.counter.update(symbols) + + def _build_from_file(self, vocab_file): + self.idx2sym = [] + self.sym2idx = OrderedDict() + + with open(vocab_file, "r", encoding="utf-8") as f: + for line in f: + symb = line.strip().split()[0] + self.add_symbol(symb) + if "" in self.sym2idx: + self.unk_idx = self.sym2idx[""] + elif "" in self.sym2idx: + self.unk_idx = self.sym2idx[""] + else: + raise ValueError("Token not in vocabulary and no token in vocabulary for replacement.") + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["pretrained_vocab_file"], + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "wb") as f: + pickle.dump(self.__dict__, f) + return (vocab_file,) + + def build_vocab(self): + if self.vocab_file: + logger.info(f"building vocab from {self.vocab_file}") + self._build_from_file(self.vocab_file) + logger.info(f"Final vocab size {len(self.sym2idx)}") + else: + logger.info(f"building vocab with min_freq={self.min_freq}, max_size={self.max_size}") + self.idx2sym = [] + self.sym2idx = OrderedDict() + + for sym in self.special: + self.add_special(sym) + + for sym, cnt in self.counter.most_common(self.max_size): + if cnt < self.min_freq: + break + self.add_symbol(sym) + + logger.info(f"Final vocab size {len(self.sym2idx)} from {len(self.counter)} unique tokens") + + @torch_only_method + def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False): + if verbose: + logger.info(f"encoding file {path} ...") + assert os.path.exists(path), f"Output file {path} not found" + encoded = [] + with open(path, "r", encoding="utf-8") as f: + for idx, line in enumerate(f): + if verbose and idx > 0 and idx % 500000 == 0: + logger.info(f" line {idx}") + symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos) + encoded.append(self.convert_to_tensor(symbols)) + + if ordered: + encoded = torch.cat(encoded) + + return encoded + + @torch_only_method + def encode_sents(self, sents, ordered=False, verbose=False): + if verbose: + logger.info(f"encoding {len(sents)} sents ...") + encoded = [] + for idx, symbols in enumerate(sents): + if verbose and idx > 0 and idx % 500000 == 0: + logger.info(f" line {idx}") + encoded.append(self.convert_to_tensor(symbols)) + + if ordered: + encoded = torch.cat(encoded) + + return encoded + + def add_special(self, sym): + if sym not in self.sym2idx: + self.idx2sym.append(sym) + self.sym2idx[sym] = len(self.idx2sym) - 1 + setattr(self, f"{sym.strip('<>')}_idx", self.sym2idx[sym]) + + def add_symbol(self, sym): + if sym not in self.sym2idx: + self.idx2sym.append(sym) + self.sym2idx[sym] = len(self.idx2sym) - 1 + + def move_added_token(self, token: str, target_idx: int): + """ + Moves an added token to a specific position in the vocab. This method should be used when resizing an embedding + layer other than the last one in the `AdaptiveEmbedding` in order to move the token in the tokenizer from the + default position (at the very end) to the desired one. + + Args: + token: The token to move to a specific position in the vocab. + target_idx: The position where the token should be moved to. + """ + assert token in self.added_tokens_encoder, "Token which should be moved has to be an added token" + assert token not in self.idx2sym, "Token which should be moved is already in vocab" + + # Insert sym into vocab + self.idx2sym.insert(target_idx, token) + self.sym2idx[token] = target_idx + + # Shift following indices in sym2idx + for idx in range(target_idx + 1, len(self.idx2sym)): + current_sym = self.idx2sym[idx] + self.sym2idx[current_sym] = idx + + # Delete token from added_tokens + old_index = self._added_tokens_encoder.pop(token) + self._added_tokens_decoder.pop(old_index) + + def moses_punct_norm(self, text): + return self.moses_punct_normalizer.normalize(text) + + def moses_tokenize(self, text): + return self.moses_tokenizer.tokenize( + text, aggressive_dash_splits=True, return_str=False, escape=False, protected_patterns=self.never_split + ) + + def moses_pipeline(self, text: str) -> list[str]: + """ + Does basic tokenization using [`sacremoses.MosesPunctNormalizer`] and [`sacremoses.MosesTokenizer`] with + *aggressive_dash_splits=True* (see [`sacremoses.tokenize.MosesTokenizer.tokenize`]). Additionally, large + comma-separated numbers and floating point values are split. E.g. "23,000 people are 1.80m tall" -> "23 @,@ 000 + people are 1 @.@ 80m tall" + + Args: + text: Text to be tokenize + + Returns: + A list of tokenized string + + Example: + + ```python + >>> tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl/transfo-xl-wt103") + >>> tokenizer.moses_pipeline("23,000 people are 1.80 m tall") + ['23', '@,@', '000', 'people', 'are', '1', '@.@', '80', 'm', 'tall'] + ```""" + text = self.moses_punct_norm(text) + text = self.moses_tokenize(text) + text = tokenize_numbers(text) + return text + + def _convert_id_to_token(self, idx): + """Converts an id in a token (BPE) using the vocab.""" + assert 0 <= idx < len(self), f"Index {idx} out of vocabulary range" + return self.idx2sym[idx] + + def _convert_token_to_id(self, sym): + """Converts a token (str) in an id using the vocab.""" + if sym in self.sym2idx: + return self.sym2idx[sym] + else: + # logger.info(f'encounter unk {sym}') + # assert '' not in sym + if hasattr(self, "unk_idx"): + return self.sym2idx.get(sym, self.unk_idx) + # Backward compatibility with pre-trained models + elif "" in self.sym2idx: + return self.sym2idx[""] + elif "" in self.sym2idx: + return self.sym2idx[""] + else: + raise ValueError("Token not in vocabulary and no token in vocabulary for replacement.") + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (string) in a single string. Additionally, the split numbers are converted back + into it's original form. + """ + out_string = self.moses_detokenizer.detokenize(tokens) + return detokenize_numbers(out_string).strip() + + @torch_only_method + def convert_to_tensor(self, symbols): + return torch.LongTensor(self.convert_tokens_to_ids(symbols)) + + @property + def vocab_size(self): + return len(self.idx2sym) + + def get_vocab(self): + vocab = self.sym2idx.copy() + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, line, add_eos=False, add_double_eos=False): + line = line.strip() + # convert to lower case + if self.lower_case: + line = line.lower() + + # empty delimiter '' will evaluate False + if self.delimiter == "": + symbols = line + else: + symbols = self.moses_pipeline(line) + + if add_double_eos: # lm1b + return [""] + symbols + [""] + elif add_eos: + return symbols + [""] + else: + return symbols + + +class LMOrderedIterator: + def __init__(self, data, bsz, bptt, device="cpu", ext_len=None): + """ + data -- LongTensor -- the LongTensor is strictly ordered + """ + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + + # Work out how cleanly we can divide the dataset into bsz parts. + self.n_step = data.size(0) // bsz + + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, self.n_step * bsz) + + # Evenly divide the data across the bsz batches. + self.data = data.view(bsz, -1).t().contiguous().to(device) + + # Number of mini-batches + self.n_batch = (self.n_step + self.bptt - 1) // self.bptt + + def get_batch(self, i, bptt=None): + if bptt is None: + bptt = self.bptt + seq_len = min(bptt, self.data.size(0) - 1 - i) + + end_idx = i + seq_len + beg_idx = max(0, i - self.ext_len) + + data = self.data[beg_idx:end_idx] + target = self.data[i + 1 : i + 1 + seq_len] + + data_out = data.transpose(0, 1).contiguous().to(self.device) + target_out = target.transpose(0, 1).contiguous().to(self.device) + + return data_out, target_out, seq_len + + def get_fixlen_iter(self, start=0): + for i in range(start, self.data.size(0) - 1, self.bptt): + yield self.get_batch(i) + + def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3): + max_len = self.bptt + max_deviation * std + i = start + while True: + bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0 + bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) + data, target, seq_len = self.get_batch(i, bptt) + i += seq_len + yield data, target, seq_len + if i >= self.data.size(0) - 2: + break + + def __iter__(self): + return self.get_fixlen_iter() + + +class LMShuffledIterator: + def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False): + """ + data -- list[LongTensor] -- there is no order among the LongTensors + """ + self.data = data + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self): + # index iterator + epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data))) + + # sentence iterator + for idx in epoch_indices: + yield self.data[idx] + + @torch_only_method + def stream_iterator(self, sent_stream): + # streams for each data in the batch + streams = [None] * self.bsz + + data = torch.LongTensor(self.bptt, self.bsz) + target = torch.LongTensor(self.bptt, self.bsz) + + n_retain = 0 + + while True: + # data : [n_retain+bptt x bsz] + # target : [bptt x bsz] + data[n_retain:].fill_(-1) + target.fill_(-1) + + valid_batch = True + + for i in range(self.bsz): + n_filled = 0 + try: + while n_filled < self.bptt: + if streams[i] is None or len(streams[i]) <= 1: + streams[i] = next(sent_stream) + # number of new tokens to fill in + n_new = min(len(streams[i]) - 1, self.bptt - n_filled) + # first n_retain tokens are retained from last batch + data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new] + target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1] + streams[i] = streams[i][n_new:] + n_filled += n_new + except StopIteration: + valid_batch = False + break + + if not valid_batch: + return + + data_out = data.transpose(0, 1).contiguous().to(self.device) + target_out = target.transpose(0, 1).contiguous().to(self.device) + + yield data_out, target_out, self.bptt + + n_retain = min(data.size(0), self.ext_len) + if n_retain > 0: + data[:n_retain] = data[-n_retain:] + data.resize_(n_retain + self.bptt, data.size(1)) + + def __iter__(self): + # sent_stream is an iterator + sent_stream = self.get_sent_stream() + + for batch in self.stream_iterator(sent_stream): + yield batch + + +class LMMultiFileIterator(LMShuffledIterator): + def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False): + self.paths = paths + self.vocab = vocab + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self, path): + sents = self.vocab.encode_file(path, add_double_eos=True) + if self.shuffle: + np.random.shuffle(sents) + sent_stream = iter(sents) + + return sent_stream + + def __iter__(self): + if self.shuffle: + np.random.shuffle(self.paths) + + for path in self.paths: + # sent_stream is an iterator + sent_stream = self.get_sent_stream(path) + for batch in self.stream_iterator(sent_stream): + yield batch + + +class TransfoXLCorpus: + @classmethod + @torch_only_method + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a pre-processed corpus. + """ + vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + is_local = os.path.isdir(pretrained_model_name_or_path) + # redirect to the cache, if necessary + try: + resolved_corpus_file = cached_file(pretrained_model_name_or_path, CORPUS_NAME, cache_dir=cache_dir) + except OSError: + logger.error( + f"Corpus '{pretrained_model_name_or_path}' was not found in corpus list" + f" ({', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys())}. We assumed '{pretrained_model_name_or_path}'" + f" was a path or url but couldn't find files {CORPUS_NAME} at this path or url." + ) + return None + if is_local: + logger.info(f"loading corpus file {resolved_corpus_file}") + else: + logger.info(f"loading corpus file {CORPUS_NAME} from cache at {resolved_corpus_file}") + + # Instantiate tokenizer. + corpus = cls(*inputs, **kwargs) + check_torch_load_is_safe() + corpus_dict = torch.load(resolved_corpus_file, weights_only=True) + for key, value in corpus_dict.items(): + corpus.__dict__[key] = value + corpus.vocab = vocab + if corpus.train is not None: + corpus.train = torch.tensor(corpus.train, dtype=torch.long) + if corpus.valid is not None: + corpus.valid = torch.tensor(corpus.valid, dtype=torch.long) + if corpus.test is not None: + corpus.test = torch.tensor(corpus.test, dtype=torch.long) + return corpus + + def __init__(self, *args, **kwargs): + self.vocab = TransfoXLTokenizer(*args, **kwargs) + self.dataset = None + self.train = None + self.valid = None + self.test = None + + def build_corpus(self, path, dataset): + self.dataset = dataset + + if self.dataset in ["ptb", "wt2", "enwik8", "text8"]: + self.vocab.count_file(os.path.join(path, "train.txt")) + self.vocab.count_file(os.path.join(path, "valid.txt")) + self.vocab.count_file(os.path.join(path, "test.txt")) + elif self.dataset == "wt103": + self.vocab.count_file(os.path.join(path, "train.txt")) + elif self.dataset == "lm1b": + train_path_pattern = os.path.join( + path, + "1-billion-word-language-modeling-benchmark-r13output", + "training-monolingual.tokenized.shuffled", + "news.en-*", + ) + train_paths = glob.glob(train_path_pattern) + # the vocab will load from file when build_vocab() is called + + self.vocab.build_vocab() + + if self.dataset in ["ptb", "wt2", "wt103"]: + self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True) + self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True) + self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True) + elif self.dataset in ["enwik8", "text8"]: + self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False) + self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False) + self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False) + elif self.dataset == "lm1b": + self.train = train_paths + self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True) + self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True) + + def get_iterator(self, split, *args, **kwargs): + if split == "train": + if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: + data_iter = LMOrderedIterator(self.train, *args, **kwargs) + elif self.dataset == "lm1b": + kwargs["shuffle"] = True + data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs) + elif split in ["valid", "test"]: + data = self.valid if split == "valid" else self.test + if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: + data_iter = LMOrderedIterator(data, *args, **kwargs) + elif self.dataset == "lm1b": + data_iter = LMShuffledIterator(data, *args, **kwargs) + else: + data_iter = None + raise ValueError(f"Split not recognized: {split}") + + return data_iter + + +@torch_only_method +def get_lm_corpus(datadir, dataset): + fn = os.path.join(datadir, "cache.pt") + fn_pickle = os.path.join(datadir, "cache.pkl") + if os.path.exists(fn): + logger.info("Loading cached dataset...") + check_torch_load_is_safe() + corpus = torch.load(fn_pickle, weights_only=True) + elif os.path.exists(fn): + logger.info("Loading cached dataset from pickle...") + if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")): + raise ValueError( + "This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially " + "malicious. It's recommended to never unpickle data that could have come from an untrusted source, or " + "that could have been tampered with. If you already verified the pickle data and decided to use it, " + "you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it." + ) + with open(fn, "rb") as fp: + corpus = pickle.load(fp) + else: + logger.info(f"Producing dataset {dataset}...") + kwargs = {} + if dataset in ["wt103", "wt2"]: + kwargs["special"] = [""] + kwargs["lower_case"] = False + elif dataset == "ptb": + kwargs["special"] = [""] + kwargs["lower_case"] = True + elif dataset == "lm1b": + kwargs["special"] = [] + kwargs["lower_case"] = False + kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt") + elif dataset in ["enwik8", "text8"]: + pass + + corpus = TransfoXLCorpus(datadir, dataset, **kwargs) + torch.save(corpus, fn) + + return corpus + + +__all__ = ["TransfoXLCorpus", "TransfoXLTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..941db2f6ac5fefe66e06d598e1adbe0db1cf1769 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__init__.py @@ -0,0 +1,20 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_tvlt import * + from .feature_extraction_tvlt import * + from .processing_tvlt import * + from .modeling_tvlt import * + from .image_processing_tvlt import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/configuration_tvlt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/configuration_tvlt.py new file mode 100644 index 0000000000000000000000000000000000000000..57144e349f5bcb244b4f9eeeefebd0312c4770aa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/configuration_tvlt.py @@ -0,0 +1,187 @@ +# coding=utf-8 +# Copyright 2023 MURGe-Lab and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TVLT model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class TvltConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`TvltModel`]. It is used to instantiate a TVLT + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the TVLT + [ZinengTang/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + spectrogram_length (`int`, *optional*, defaults to 2048): + The time length of each audio spectrogram. + frequency_length (`int`, *optional*, defaults to 128): + The frequency length of audio spectrogram. + image_patch_size (`list[int]`, *optional*, defaults to `[16, 16]`): + The size (resolution) of each image patch. + audio_patch_size (`list[int]`, *optional*, defaults to `[16, 16]`): + The size (resolution) of each audio patch. + num_image_channels (`int`, *optional*, defaults to 3): + The number of input image channels. + num_audio_channels (`int`, *optional*, defaults to 1): + The number of input audio channels. + num_frames (`int`, *optional*, defaults to 8): + The maximum number of frames for an input video. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + use_mean_pooling (`bool`, *optional*, defaults to `False`): + Whether to mean pool the final hidden states instead of using the final hidden state of the [CLS] token. + decoder_num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the decoder. + decoder_hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the decoder. + decoder_num_hidden_layers (`int`, *optional*, defaults to 8): + Number of hidden layers in the decoder. + decoder_intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. + pixel_mask_ratio (`float`, *optional*, defaults to 0.75): + Image patch masking ratio. + audio_mask_ratio (`float`, *optional*, defaults to 0.15): + Audio patch masking ratio. + audio_mask_type (`str`, *optional*, defaults to `"frame-level"`): + Audio patch masking type, choose between "frame-level" and "patch-level". + task_matching (`bool`, *optional*, defaults to `True`): + Whether to use vision audio matching task in pretraining. + task_mae (`bool`, *optional*, defaults to `True`): + Whether to use the masked auto-encoder (MAE) in pretraining. + loss_type (`str`, *optional*, defaults to `"classification"`): + Loss types including regression and classification. + + Example: + + ```python + >>> from transformers import TvltConfig, TvltModel + + >>> # # Initializing a TVLT ZinengTang/tvlt-base style configuration + >>> configuration = TvltConfig() + + >>> # # Initializing a model (with random weights) from the ZinengTang/tvlt-base style configuration + >>> model = TvltModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "tvlt" + + def __init__( + self, + image_size=224, + spectrogram_length=2048, + frequency_length=128, + image_patch_size=[16, 16], + audio_patch_size=[16, 16], + num_image_channels=3, + num_audio_channels=1, + num_frames=8, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + qkv_bias=True, + use_mean_pooling=False, + decoder_num_attention_heads=16, + decoder_hidden_size=512, + decoder_num_hidden_layers=8, + decoder_intermediate_size=2048, + pixel_mask_ratio=0.75, + audio_mask_ratio=0.15, + audio_mask_type="frame-level", + task_matching=True, + task_mae=True, + loss_type="classification", + **kwargs, + ): + super().__init__(**kwargs) + + if audio_mask_type not in ("frame-level", "patch_level"): + raise ValueError( + "audio_mask_type must be one of two acceptable strategies - {'frame_level', 'patch-level') " + f"got {audio_mask_type}" + ) + + self.image_size = image_size + self.spectrogram_length = spectrogram_length + self.frequency_length = frequency_length + self.image_patch_size = image_patch_size + self.audio_patch_size = audio_patch_size + self.num_image_channels = num_image_channels + self.num_audio_channels = num_audio_channels + self.num_frames = num_frames + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.qkv_bias = qkv_bias + self.use_mean_pooling = use_mean_pooling + + self.decoder_num_attention_heads = decoder_num_attention_heads + self.decoder_hidden_size = decoder_hidden_size + self.decoder_num_hidden_layers = decoder_num_hidden_layers + self.decoder_intermediate_size = decoder_intermediate_size + self.pixel_mask_ratio = pixel_mask_ratio + self.audio_mask_ratio = audio_mask_ratio + self.audio_mask_type = audio_mask_type + + self.task_matching = task_matching + self.task_mae = task_mae + self.loss_type = loss_type + + +__all__ = ["TvltConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py new file mode 100644 index 0000000000000000000000000000000000000000..b9350d31a0197926ba96e04dacfdb7bc71adbfc6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py @@ -0,0 +1,233 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for TVLT.""" + +from math import ceil +from typing import Optional, Union + +import numpy as np + +from ....audio_utils import mel_filter_bank, spectrogram, window_function +from ....feature_extraction_sequence_utils import BatchFeature, SequenceFeatureExtractor +from ....utils import TensorType, logging + + +logger = logging.get_logger(__name__) + + +class TvltFeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs a TVLT audio feature extractor. This feature extractor can be used to prepare audios for the model. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + Args: + spectrogram_length (`dict[str, int]` *optional*, defaults to 2048): + The time length of each audio spectrogram. + num_channels (`int` *optional*, defaults to 1): + Number of audio channels. + patch_size (`list[int]` *optional*, defaults to `[16, 16]`): + The patch size of audio patch embedding. + feature_size (`int`, *optional*, defaults to 128): + The frequency length of audio spectrogram. + sampling_rate (`int`, *optional*, defaults to 44100): + The sampling rate at which the audio files should be digitalized expressed in Hertz (Hz). + hop_length_to_sampling_rate (`int`, *optional*, defaults to 86): + Hop length is length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients. + For example, with sampling rate 44100, the hop length is 512, with 44100 / 512 = 86 + n_fft (`int`, *optional*, defaults to 2048): + Size of the Fourier transform. + padding_value (`float`, *optional*, defaults to 0.0): + Padding value used to pad the audio. Should correspond to silences. + """ + + model_input_names = ["audio_values", "audio_mask"] + + def __init__( + self, + spectrogram_length=2048, + num_channels=1, + patch_size=[16, 16], + feature_size=128, + sampling_rate=44100, + hop_length_to_sampling_rate=86, + n_fft=2048, + padding_value=0.0, + **kwargs, + ): + super().__init__( + feature_size=feature_size, + sampling_rate=sampling_rate, + padding_value=padding_value, + **kwargs, + ) + + self.spectrogram_length = spectrogram_length + self.num_channels = num_channels + self.patch_size = patch_size + self.freq_len = feature_size // self.patch_size[1] + self.n_fft = n_fft + self.hop_length = sampling_rate // hop_length_to_sampling_rate + self.sampling_rate = sampling_rate + self.padding_value = padding_value + self.mel_filters = mel_filter_bank( + num_frequency_bins=1 + n_fft // 2, + num_mel_filters=feature_size, + min_frequency=0.0, + max_frequency=22050.0, + sampling_rate=sampling_rate, + norm="slaney", + mel_scale="slaney", + ).T + + def _np_extract_fbank_features(self, waveform: np.ndarray) -> np.ndarray: + """ + Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch + implementation with 1e-5 tolerance. + """ + log_spec = spectrogram( + waveform, + window_function(self.n_fft, "hann"), + frame_length=self.n_fft, + hop_length=self.hop_length, + power=2.0, + mel_filters=self.mel_filters.T, + log_mel="dB", + db_range=80.0, + ) + log_spec = log_spec[:, :-1] + log_spec = log_spec - 20.0 + log_spec = np.clip(log_spec / 40.0, -2.0, 0.0) + 1.0 + return log_spec + + def __call__( + self, + raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]], + return_tensors: Optional[Union[str, TensorType]] = None, + return_attention_mask: Optional[bool] = True, + sampling_rate: Optional[int] = None, + resample: bool = False, + mask_audio: bool = False, + **kwargs, + ) -> BatchFeature: + """ + Main method to prepare one or several audio(s) for the model. + + Args: + raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`): + The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + return_attention_mask (`bool`, *optional*, default to `True`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific feature_extractor's default. [What are attention masks?](../glossary#attention-mask) + + + + For TvltTransformer models, `attention_mask` should always be passed for batched inference, to avoid + subtle bugs. + + + + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition + pipeline. Current model supports sampling rate 16000 and 44100. + resample (`bool`, *optional*, defaults to `False`): + If the sampling rate is not matched, resample the input audio to match. + mask_audio (`bool`, *optional*, defaults to `False`): + Whether or not to mask input audio for MAE task. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **audio_values** -- Audio values to be fed to a model, of shape (batch_size, num_channels, height, + width). + + - **audio_mask** -- Audio masks to be fed to a model, of shape (batch_size, num_audio_patches). + """ + + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + "This feature extractor is set to support sampling rate" + f" of {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled" + f" with {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + "It is strongly recommended to pass the `sampling_rate` argument to this function. " + "Failing to do so can result in silent errors that might be hard to debug." + ) + + is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") + is_batched = is_batched_numpy or ( + isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) + ) + if is_batched: + raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech] + elif not is_batched and not isinstance(raw_speech, np.ndarray): + raw_speech = np.asarray(raw_speech, dtype=np.float32) + elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64): + raw_speech = raw_speech.astype(np.float32) + # always return batch + if not is_batched: + raw_speech = [np.asarray([raw_speech]).T] + + # Convert audio signals to log mel spectrograms, truncate by time axis + audio_features = [ + self._np_extract_fbank_features(waveform.squeeze()).T[: self.spectrogram_length] for waveform in raw_speech + ] + if isinstance(audio_features[0], list): + audio_features = [np.asarray(feature, dtype=np.float32) for feature in audio_features] + + # Create audio attention mask + max_patch_len = max( + ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len for feature in audio_features + ) # The maximum number of audio patches in a batch + if return_attention_mask: + audio_mask = [ + (ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len) * [1] + + (max_patch_len - ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len) * [0] + for feature in audio_features + ] + audio_mask = np.array(audio_mask).astype(np.float32) + + # convert into correct format for padding + max_time_len = max_patch_len // self.freq_len * self.patch_size[0] # The maximum audio size in a batch + padded_audio_features = np.ones([len(audio_features), 1, max_time_len, self.feature_size]).astype(np.float32) + padded_audio_features = padded_audio_features * self.padding_value + for i in range(len(audio_features)): + feature = audio_features[i] + padded_audio_features[i, :, : feature.shape[0], :] = feature + + # return as BatchFeature + if return_attention_mask: + data = {"audio_values": padded_audio_features, "audio_mask": audio_mask} + else: + data = {"audio_values": padded_audio_features} + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + return encoded_inputs + + +__all__ = ["TvltFeatureExtractor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/image_processing_tvlt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/image_processing_tvlt.py new file mode 100644 index 0000000000000000000000000000000000000000..01fb42429a96fdb8b171dbeedbd21e45a66a718f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/image_processing_tvlt.py @@ -0,0 +1,438 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for TVLT.""" + +from typing import Optional, Union + +import numpy as np + +from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ....image_transforms import ( + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ....image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + is_valid_image, + to_numpy_array, + valid_images, + validate_kwargs, + validate_preprocess_arguments, +) +from ....utils import TensorType, logging + + +logger = logging.get_logger(__name__) + + +def make_batched(videos) -> list[list[ImageInput]]: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)): + return videos + + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + videos_dim = np.array(videos[0]).ndim + if videos_dim == 3: + return [videos] + elif videos_dim == 4: + return videos + + elif is_valid_image(videos): + videos_dim = np.array(videos).ndim + if videos_dim == 3: + return [[videos]] + elif videos_dim == 4: + return [videos] + elif videos_dim == 5: + return videos + + raise ValueError(f"Could not make batched video from {videos}") + + +class TvltImageProcessor(BaseImageProcessor): + r""" + Constructs a TVLT image processor. + + This processor can be used to prepare either videos or images for the model by converting images to 1-frame videos. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the output image after resizing. The shortest edge of the image will be resized to + `size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by + `size` in the `preprocess` method. + patch_size (`list[int]` *optional*, defaults to [16,16]): + The patch size of image patch embedding. + num_frames (`int` *optional*, defaults to 8): + The maximum number of video frames. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop` + parameter in the `preprocess` method. + crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): + Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the + `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to 1/255): + Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter + in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = [ + "pixel_values", + "pixel_mask", + "pixel_values_mixed", + "pixel_mask_mixed", + ] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + patch_size: list[int] = [16, 16], + num_frames: int = 8, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_center_crop: bool = True, + crop_size: Optional[dict[str, int]] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = IMAGENET_STANDARD_MEAN, + image_std: Optional[Union[float, list[float]]] = IMAGENET_STANDARD_STD, + init_mask_generator=False, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.patch_size = patch_size + self.num_frames = num_frames + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self._valid_processor_keys = [ + "videos", + "do_resize", + "size", + "patch_size", + "num_frames", + "resample", + "do_center_crop", + "crop_size", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "is_mixed", + "return_tensors", + "data_format", + "input_data_format", + ] + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will + have the size `(h, w)`. If `size` is of the form `{"shortest_edge": s}`, the output image will have its + shortest edge of length `s` while keeping the aspect ratio of the original image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" in size: + output_size = get_resize_output_image_size( + image, size["shortest_edge"], default_to_square=False, input_data_format=input_data_format + ) + elif "height" in size and "width" in size: + output_size = (size["height"], size["width"]) + else: + raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}") + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def _preprocess_image( + self, + image: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[dict[str, int]] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """Preprocesses a single image.""" + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + # All transformations expect numpy arrays. + image = to_numpy_array(image) + + if do_rescale and is_scaled_image(image): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + return image + + def preprocess( + self, + videos: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + patch_size: Optional[list[int]] = None, + num_frames: Optional[int] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[dict[str, int]] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + is_mixed: bool = False, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an videos or image or batch of videos or images. + + Args: + videos (`ImageInput`): + Images or videos to preprocess. Expects a single or batch of frames with pixel values ranging from 0 to + 255. If passing in frames with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after applying resize. + patch_size (`list[int]` *optional*, defaults to self.patch_size): + The patch size of image patch embedding. + num_frames (`int` *optional*, defaults to self.num_frames): + The maximum number of video frames. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_centre_crop`): + Whether to centre crop the image. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the image after applying the centre crop. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation. + is_mixed (`bool`, *optional*): + If the input video has negative samples. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the inferred channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height, + width). + + - **pixel_mask** -- Pixel masks to be fed to a model, of shape (batch_size, num_pixel_patches). + + - **pixel_values_mixed** -- Pixel values with both positive or negative to be fed to a model, of shape + (batch_size, num_channels, height, width). + + - **pixel_mask_mixed** -- Pixel masks with both positive or negative to be fed to a model, of shape + (batch_size, num_pixel_patches). + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size") + patch_size = patch_size if patch_size is not None else self.patch_size + num_frames = num_frames if patch_size is not None else self.num_frames + + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + if not valid_images(videos): + raise ValueError( + "Invalid image or video type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + videos = make_batched(videos) + + # Check number of frames is fewer than maximum frames + for video in videos: + if len(video) > self.num_frames: + raise ValueError( + f"number of frames must not be greater than the maximum frames of the model {self.num_frames}." + ) + + max_num_frames = max(len(video) for video in videos) + num_patches_per_image = (size["shortest_edge"] // patch_size[0]) ** 2 + video_masks = np.array( + [ + len(video) * num_patches_per_image * [1] + (max_num_frames - len(video)) * num_patches_per_image * [0] + for video in videos + ] + ) + + videos = [ + [ + self._preprocess_image( + image=img, + do_resize=do_resize, + size=size, + resample=resample, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + input_data_format=input_data_format, + ) + for img in video + ] + for video in videos + ] + + # If videos contain both positive/negative, use mixed key for video-audio matching task + if is_mixed: + data = {"pixel_values_mixed": videos, "pixel_mask_mixed": video_masks} + else: + data = {"pixel_values": videos, "pixel_mask": video_masks} + + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["TvltImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/modeling_tvlt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/modeling_tvlt.py new file mode 100644 index 0000000000000000000000000000000000000000..2b21df928ff3d74022d858f7a73578802127dd08 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/modeling_tvlt.py @@ -0,0 +1,1274 @@ +# coding=utf-8 +# Copyright 2023 MURGe-Lab and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch TVLT model.""" + +import collections.abc +import math +from copy import deepcopy +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ....activations import ACT2FN +from ....modeling_layers import GradientCheckpointingLayer +from ....modeling_outputs import BaseModelOutput, SequenceClassifierOutput +from ....modeling_utils import PreTrainedModel +from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ....utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_tvlt import TvltConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "TvltConfig" +_CHECKPOINT_FOR_DOC = "ZinengTang/tvlt-base" + + +@dataclass +class TvltModelOutput(ModelOutput): + """ + Class for TvltModel's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + last_pixel_hidden_state (`torch.FloatTensor` of shape `(batch_size, pixel_sequence_length, hidden_size)`): + Pixel sequence of hidden-states at the output of the last layer of the model. + last_audio_hidden_state (`torch.FloatTensor` of shape `(batch_size, audio_sequence_length, hidden_size)`): + Audio sequence of hidden-states at the output of the last layer of the model. + pixel_label_masks (`torch.FloatTensor` of shape `(batch_size, pixel_patch_length)`): + Tensor indicating which pixel patches are masked (1) and which are not (0). + audio_label_masks (`torch.FloatTensor` of shape `(batch_size, audio_patch_length)`): + Tensor indicating which audio patches are masked (1) and which are not (0). + pixel_ids_restore (`torch.LongTensor` of shape `(batch_size, pixel_patch_length)`): + Tensor containing the ids permutation of pixel masking. + audio_ids_restore (`torch.LongTensor` of shape `(batch_size, audio_patch_length)`): + Tensor containing the ids permutation of audio masking. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + last_pixel_hidden_state: Optional[torch.FloatTensor] = None + last_audio_hidden_state: Optional[torch.FloatTensor] = None + pixel_label_masks: Optional[torch.LongTensor] = None + audio_label_masks: Optional[torch.LongTensor] = None + pixel_ids_restore: Optional[torch.LongTensor] = None + audio_ids_restore: Optional[torch.LongTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class TvltDecoderOutput(ModelOutput): + """ + Class for TvltDecoder's outputs, with potential hidden states and attentions. + + Args: + logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`): + Pixel reconstruction logits. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class TvltForPreTrainingOutput(ModelOutput): + """ + Class for TvltForPreTraining's outputs, with potential hidden states and attentions. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`): + Pixel reconstruction loss. + matching_logits (`torch.FloatTensor` of shape `(batch_size, 1)`): + Matching objective logits. + pixel_logits (`torch.FloatTensor` of shape + `(batch_size, pixel_patch_length, image_patch_size ** 3 * pixel_num_channels)`): Pixel reconstruction + logits. + audio_logits (`torch.FloatTensor` of shape + `(batch_size, audio_patch_length, image_patch_size[0] * image_patch_size[1])`): Audio reconstruction + logits. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + matching_logits: Optional[torch.FloatTensor] = None + pixel_logits: Optional[torch.FloatTensor] = None + audio_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +def generate_pixel_mask_noise(pixel_values, pixel_mask=None, mask_ratio=0.75): + """Generate noise for audio masking.""" + + batch_size, seq_len = pixel_values.shape[:2] + noise = torch.rand((batch_size, seq_len), device=pixel_values.device) # noise in [0, 1] + len_keep = int(seq_len * (1 - mask_ratio)) + return noise, len_keep + + +def generate_audio_mask_noise(audio_values, audio_mask=None, mask_ratio=0.75, mask_type="patch-level", freq_len=8): + """Generate noise for audio masking.""" + + batch_size, seq_len = audio_values.shape[:2] + if mask_type == "frame-level": + num_time_patches = seq_len // freq_len + noise = ( + torch.rand(batch_size, num_time_patches, device=audio_values.device) + .unsqueeze(-1) + .repeat(1, 1, freq_len) + .view(batch_size, seq_len) + ) # noise in [0, 1] + elif mask_type == "patch-level": + noise = torch.rand(batch_size, seq_len, device=audio_values.device) # noise in [0, 1] + len_keep = int(seq_len * (1 - mask_ratio)) + return noise, len_keep + + +def random_masking(sequence, noise, len_keep, attention_masks=None): + """ + Perform random masking by per-sample shuffling on frame-level. Per-sample shuffling is done by argsort random + noise. sequence: [batch_size, seq_len, hidden_dim], sequence + """ + + batch_size, seq_len, hidden_dim = sequence.shape + + # sort noise for each sample + ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=1) + + # keep the first subset + ids_keep = ids_shuffle[:, :len_keep] + sequence_masked = torch.gather(sequence, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, hidden_dim)) + + # generate the binary mask: 0 is keep, 1 is remove + label_masks = torch.ones([batch_size, seq_len], device=sequence.device) + label_masks[:, :len_keep] = 0 + # unshuffle to get the binary mask + label_masks = torch.gather(label_masks, dim=1, index=ids_restore) + + if attention_masks is not None: + label_masks *= attention_masks + attention_masks = torch.gather(attention_masks, dim=1, index=ids_keep) + + return sequence_masked, attention_masks, label_masks, ids_restore + + +class TvltPixelEmbeddings(nn.Module): + """Construct the patch and position embeddings.""" + + def __init__(self, config): + super().__init__() + + self.patch_embeddings = TvltPixelPatchEmbeddings(config) + self.num_patches_per_image = self.patch_embeddings.num_patches_per_image + + self.type_embed_v = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size)) + self.pos_embed_v = nn.Parameter(torch.zeros(1, self.num_patches_per_image, config.hidden_size)) + + self.config = config + + def forward(self, pixel_values, attention_masks=None): + # create patch embeddings + batch_size, num_frames, num_channels, height, width = pixel_values.shape + + embeddings = self.patch_embeddings(pixel_values) + embeddings += self.pos_embed_v.repeat(1, num_frames, 1) + embeddings += torch.repeat_interleave(self.temporal_embed[:, :num_frames], self.num_patches_per_image, dim=1) + embeddings += self.type_embed_v + + return embeddings, attention_masks + + +class TvltAudioEmbeddings(nn.Module): + """Construct the patch and position embeddings.""" + + def __init__(self, config): + super().__init__() + + self.patch_embeddings = TvltAudioPatchEmbeddings(config) + self.num_patches = self.patch_embeddings.num_patches + + self.type_embed_a = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.num_freq_patches = config.frequency_length // config.audio_patch_size[1] + self.pos_embed_a = nn.Parameter(torch.zeros(1, self.num_patches // self.num_freq_patches, config.hidden_size)) + self.freq_embed = nn.Parameter(torch.zeros(1, self.num_freq_patches, config.hidden_size)) + + self.num_freq_patches = config.frequency_length // config.audio_patch_size[1] + self.config = config + + def forward(self, audio_values, attention_masks=None): + # create patch embeddings + embeddings = self.patch_embeddings(audio_values) + + num_time_patches = embeddings.size(1) // self.num_freq_patches + embeddings += self.freq_embed.repeat(1, num_time_patches, 1) + embeddings += torch.repeat_interleave(self.pos_embed_a[:, :num_time_patches], self.num_freq_patches, dim=1) + embeddings += self.type_embed_a + + return embeddings, attention_masks + + +class TvltPixelPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.image_patch_size + num_channels, hidden_size = config.num_image_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches_per_image = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches_per_image = num_patches_per_image + self.hidden_size = hidden_size + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, num_frames, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." + ) + + pixel_values = pixel_values.reshape(batch_size * num_frames, num_channels, height, width) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + embeddings = embeddings.reshape(batch_size, num_frames * self.num_patches_per_image, self.hidden_size) + + return embeddings + + +class TvltAudioPatchEmbeddings(nn.Module): + """ + This class turns `audio_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + spectrogram_length, frequency_length, patch_size = ( + config.spectrogram_length, + config.frequency_length, + config.audio_patch_size, + ) + num_channels, hidden_size = config.num_audio_channels, config.hidden_size + + spectrogram_size = (spectrogram_length, frequency_length) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (spectrogram_size[1] // patch_size[1]) * (spectrogram_size[0] // patch_size[0]) + patch_shape = (spectrogram_size[0] // patch_size[0], spectrogram_size[1] // patch_size[1]) + self.spectrogram_size = spectrogram_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + self.patch_shape = patch_shape + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, audio_values: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, height, width = audio_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + if height > self.spectrogram_size[0] or width != self.spectrogram_size[1]: + raise ValueError( + f"Input audio size ({height}*{width}) doesn't match model" + f" ({self.spectrogram_size[0]}*{self.spectrogram_size[1]})." + ) + embeddings = self.projection(audio_values).flatten(2).transpose(1, 2) + + return embeddings + + +class TvltSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False): + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class TvltSelfOutput(nn.Module): + """ + The residual connection is defined in TvltLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: TvltConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class TvltAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = TvltSelfAttention(config) + self.output = TvltSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False): + self_outputs = self.attention(hidden_states, attention_mask, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class TvltIntermediate(nn.Module): + def __init__(self, config: TvltConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +class TvltOutput(nn.Module): + def __init__(self, config: TvltConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +class TvltLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = TvltAttention(config) + self.intermediate = TvltIntermediate(config) + self.output = TvltOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False): + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViLT, layernorm is applied before self-attention + attention_mask, + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states.to(attention_output.device) + + # in ViLT, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +class TvltEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([TvltLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class TvltPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: TvltConfig + base_model_prefix = "tvlt" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +TVLT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`TvltConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +TVLT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for + details. + + audio_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Audio values. Audio values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for + details. + + pixel_mask (`torch.FloatTensor` of shape `(batch_size, num_pixel_patches)`): + Pixel masks. Pixel masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for + details. + + audio_mask (`torch.FloatTensor` of shape `(batch_size, num_audio_patches)`): + Audio masks. Audio masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for + details. + + pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`): + Pixel values that mix positive and negative samples in Tvlt vision-audio matching. Pixel values mixed can + be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details. + + pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel masks of pixel_values_mixed. Pixel masks mixed can be obtained using [`TvltProcessor`]. See + [`TvltProcessor.__call__`] for details. + + mask_pixel (`bool`, *optional*): + Whether to mask pixel for MAE tasks. Only set to True in TvltForPreTraining. + + mask_audio (`bool`, *optional*): + Whether to mask audio for MAE tasks. Only set to True in TvltForPreTraining. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare TVLT Model transformer outputting raw hidden-states without any specific head on top.", + TVLT_START_DOCSTRING, +) +class TvltModel(TvltPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.pixel_embeddings = TvltPixelEmbeddings(config) + self.audio_embeddings = TvltAudioEmbeddings(config) + self.encoder = TvltEncoder(config) + + self.cls_embedding = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + + if config.use_mean_pooling: + self.layernorm = None + else: + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.pixel_embeddings.patch_embeddings, self.audio_embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TvltModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + audio_values: torch.FloatTensor, + pixel_mask: Optional[torch.FloatTensor] = None, + audio_mask: Optional[torch.FloatTensor] = None, + mask_pixel: bool = False, + mask_audio: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], TvltModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import TvltProcessor, TvltModel + >>> import numpy as np + >>> import torch + + >>> num_frames = 8 + >>> images = list(np.random.randn(num_frames, 3, 224, 224)) + >>> audio = list(np.random.randn(10000)) + + >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base") + >>> model = TvltModel.from_pretrained("ZinengTang/tvlt-base") + + >>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt") + + >>> outputs = model(**input_dict) + >>> loss = outputs.loss + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + pixel_embedding_output, pixel_mask = self.pixel_embeddings(pixel_values, pixel_mask) + + audio_embedding_output, audio_mask = self.audio_embeddings(audio_values, audio_mask) + + # Mask pixel if mask_pixel is True + pixel_label_masks = None + pixel_ids_restore = None + if mask_pixel: + pixel_mask_noise, pixel_len_keep = generate_pixel_mask_noise( + pixel_embedding_output, pixel_mask=pixel_mask, mask_ratio=self.config.pixel_mask_ratio + ) + pixel_embedding_output, pixel_mask, pixel_label_masks, pixel_ids_restore = random_masking( + pixel_embedding_output, + pixel_mask_noise, + pixel_len_keep, + attention_masks=pixel_mask, + ) + + # Mask audio if mask_audio is True + audio_label_masks = None + audio_ids_restore = None + if mask_audio: + num_freq_patches = self.config.frequency_length // self.config.audio_patch_size[1] + audio_mask_noise, audio_len_keep = generate_audio_mask_noise( + audio_embedding_output, + audio_mask=audio_mask, + mask_ratio=self.config.audio_mask_ratio, + mask_type=self.config.audio_mask_type, + freq_len=num_freq_patches, + ) + audio_embedding_output, audio_mask, audio_label_masks, audio_ids_restore = random_masking( + audio_embedding_output, + audio_mask_noise, + audio_len_keep, + attention_masks=audio_mask, + ) + + # Prepare for encoder inputs and attention masks + batch_size = pixel_values.size(0) + embedding_output = torch.cat( + [self.cls_embedding.repeat(batch_size, 1, 1), pixel_embedding_output, audio_embedding_output], 1 + ) + masked_pixel_len = pixel_embedding_output.size(1) + + attention_mask = None + if pixel_mask is not None and audio_mask is not None: + attention_mask = torch.cat([pixel_mask[:, :1], pixel_mask, audio_mask], 1) + + input_shape = embedding_output.size() + extended_attention_mask = None + if attention_mask is not None: + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + if self.layernorm is not None: + sequence_output = self.layernorm(sequence_output) + + pixel_sequence_output = sequence_output[:, 1 : 1 + masked_pixel_len] + audio_sequence_output = sequence_output[:, 1 + masked_pixel_len :] + if not return_dict: + return ( + sequence_output, + pixel_sequence_output, + audio_sequence_output, + pixel_label_masks, + audio_label_masks, + pixel_ids_restore, + audio_ids_restore, + ) + encoder_outputs[1:] + + return TvltModelOutput( + last_hidden_state=sequence_output, + last_pixel_hidden_state=pixel_sequence_output, + last_audio_hidden_state=audio_sequence_output, + pixel_label_masks=pixel_label_masks, + audio_label_masks=audio_label_masks, + pixel_ids_restore=pixel_ids_restore, + audio_ids_restore=audio_ids_restore, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class TvltDecoder(nn.Module): + def __init__(self, config): + super().__init__() + + decoder_config = deepcopy(config) + decoder_config.hidden_size = config.decoder_hidden_size + decoder_config.num_hidden_layers = config.decoder_num_hidden_layers + decoder_config.num_attention_heads = config.decoder_num_attention_heads + decoder_config.intermediate_size = config.decoder_intermediate_size + self.decoder_layers = nn.ModuleList( + [TvltLayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)] + ) + + self.layernorm = nn.LayerNorm(config.decoder_hidden_size, eps=config.layer_norm_eps) + + self.gradient_checkpointing = False + self.config = config + + def forward( + self, + hidden_states, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + # apply Transformer layers (blocks) + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + for i, layer_module in enumerate(self.decoder_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module(hidden_states, output_attentions=output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # predictor projection + logits = self.layernorm(hidden_states) + + if not return_dict: + return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None) + return TvltDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions) + + +@add_start_docstrings( + "The TVLT Model transformer with the decoder on top for self-supervised pre-training.", + TVLT_START_DOCSTRING, +) +class TvltForPreTraining(TvltPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.task_matching = config.task_matching + self.task_mae = config.task_mae + if not (self.task_matching or self.task_mae): + raise ValueError("Must set at least one of matching task and MAE task to true") + + self.tvlt = TvltModel(config) + + if self.task_matching: + self.matching_head = TvltMatchingHead(config) + + if self.task_mae: + self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=True) + + self.pixel_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size)) + self.audio_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size)) + + self.decoder = TvltDecoder(config) + + decoder_hidden_size = config.decoder_hidden_size + + num_frames = config.num_frames + num_patches_per_image = self.tvlt.pixel_embeddings.num_patches_per_image + self.decoder_pixel_pos_embed = nn.Parameter(torch.zeros(1, num_patches_per_image, decoder_hidden_size)) + self.decoder_temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, decoder_hidden_size)) + self.decoder_pixel_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size)) + + num_audio_patches = self.tvlt.audio_embeddings.num_patches + num_freq_patches = config.frequency_length // config.audio_patch_size[1] + self.decoder_audio_pos_embed = nn.Parameter( + torch.zeros(1, num_audio_patches // num_freq_patches, decoder_hidden_size) + ) + self.decoder_freq_embed = nn.Parameter(torch.zeros(1, num_freq_patches, decoder_hidden_size)) + self.decoder_audio_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size)) + + pixel_mae_output_dim = self.config.image_patch_size[0] ** 2 * self.config.num_image_channels + self.pixel_mae_head = TvltMAEHead(config, pixel_mae_output_dim) + audio_mae_output_dim = ( + self.config.audio_patch_size[0] * self.config.audio_patch_size[1] * self.config.num_audio_channels + ) + self.audio_mae_head = TvltMAEHead(config, audio_mae_output_dim) + + self.num_frames = num_frames + self.num_patches_per_image = num_patches_per_image + self.num_freq_patches = num_freq_patches + self.image_patch_size = config.image_patch_size + self.audio_patch_size = config.audio_patch_size + + # Initialize weights and apply final processing + self.post_init() + + def patchify_pixel(self, pixel_values): + """ + pixel_values: [batch_size, num_frames, 3, height, width] + """ + batch_size, num_frames, num_channels, height, width = pixel_values.shape + num_patches_height = pixel_values.shape[3] // self.image_patch_size[0] + num_patches_width = pixel_values.shape[4] // self.image_patch_size[1] + patchified_pixel_values = pixel_values.reshape( + shape=( + batch_size, + num_frames, + num_channels, + num_patches_height, + self.image_patch_size[0], + num_patches_width, + self.image_patch_size[1], + ) + ) + patchified_pixel_values = torch.einsum("ntchpwq->nthwpqc", patchified_pixel_values) + patchified_pixel_values = patchified_pixel_values.reshape( + shape=( + batch_size, + num_patches_height * num_patches_width * num_frames, + self.image_patch_size[0] * self.image_patch_size[1] * num_channels, + ) + ) + return patchified_pixel_values + + def patchify_audio(self, audio_values): + """ + audio_values: [batch_size, 1, height, width] + """ + batch_size, num_channels, height, width = audio_values.shape + num_patches_height = height // self.audio_patch_size[0] + num_patches_width = width // self.audio_patch_size[1] + patchified_audio_values = audio_values.reshape( + shape=( + batch_size, + num_channels, + num_patches_height, + self.audio_patch_size[0], + num_patches_width, + self.audio_patch_size[1], + ) + ) + patchified_audio_values = torch.einsum("nchpwq->nhwpqc", patchified_audio_values) + patchified_audio_values = patchified_audio_values.reshape( + shape=( + batch_size, + num_patches_height * num_patches_width, + self.audio_patch_size[0] * self.audio_patch_size[1] * num_channels, + ) + ) + return patchified_audio_values + + def pixel_mae_loss(self, pixel_values, pixel_predictions, mask): + patchified_pixel_values = self.patchify_pixel(pixel_values) + loss = (pixel_predictions - patchified_pixel_values) ** 2 + loss = loss.mean(dim=-1) # [batch_size, pixel_pixel_length], mean loss per patch + loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches + return loss + + def audio_mae_loss(self, audio_values, audio_predictions, mask): + patchified_audio_values = self.patchify_audio(audio_values) + loss = (audio_predictions - patchified_audio_values) ** 2 + loss = loss.mean(dim=-1) # [batch_size, audio_pixel_length], mean loss per patch + loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches + return loss + + def concatenate_mask(self, mask_token, sequence, ids_restore): + batch_size, seq_length, dim = sequence.shape + mask_tokens = mask_token.repeat(batch_size, ids_restore.shape[1] - seq_length, 1) + padded_sequence = torch.cat([sequence, mask_tokens], dim=1) + padded_sequence = torch.gather( + padded_sequence, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, dim) + ) # unshuffle + return padded_sequence + + @add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TvltForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + audio_values: torch.FloatTensor, + pixel_mask: Optional[torch.FloatTensor] = None, + audio_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + pixel_values_mixed: Optional[torch.FloatTensor] = None, + pixel_mask_mixed: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], TvltForPreTrainingOutput]: + r""" + pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`): + Pixel values that mix positive and negative samples in Tvlt vision-audio matching. Audio values can be + obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details. + + pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel masks of pixel_values_mixed. Pixel values mixed can be obtained using [`TvltProcessor`]. See + [`TvltProcessor.__call__`] for details. + + labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*): + Labels for computing the vision audio matching loss. Indices should be in `[0, 1]`. num_labels has to be 1. + + Return: + + Examples: + + ```python + >>> from transformers import TvltProcessor, TvltForPreTraining + >>> import numpy as np + >>> import torch + + >>> num_frames = 8 + >>> images = list(np.random.randn(num_frames, 3, 224, 224)) + >>> images_mixed = list(np.random.randn(num_frames, 3, 224, 224)) + >>> audio = list(np.random.randn(10000)) + >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base") + >>> model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base") + >>> input_dict = processor( + ... images, audio, images_mixed, sampling_rate=44100, mask_pixel=True, mask_audio=True, return_tensors="pt" + ... ) + + >>> outputs = model(**input_dict) + >>> loss = outputs.loss + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + total_loss = 0.0 + + if self.task_matching: + if labels is None: + raise ValueError("Matching task requires labels") + if pixel_values_mixed is None: + raise ValueError("Matching task requires pixel_values_mixed") + + outputs = self.tvlt( + pixel_values_mixed, + audio_values, + pixel_mask=pixel_mask_mixed, + audio_mask=audio_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + matching_logits = self.matching_head(sequence_output) + + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(matching_logits.view(-1), labels.view(-1)) + total_loss += loss + + pixel_logits = None + audio_logits = None + if self.task_mae and self.training: + outputs = self.tvlt( + pixel_values, + audio_values, + pixel_mask=pixel_mask, + audio_mask=audio_mask, + mask_pixel=True, + mask_audio=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pixel_sequence_output = outputs.last_pixel_hidden_state if return_dict else outputs[1] + audio_sequence_output = outputs.last_audio_hidden_state if return_dict else outputs[2] + pixel_label_masks = outputs.pixel_label_masks if return_dict else outputs[3] + audio_label_masks = outputs.audio_label_masks if return_dict else outputs[4] + pixel_ids_restore = outputs.pixel_ids_restore if return_dict else outputs[5] + audio_ids_restore = outputs.audio_ids_restore if return_dict else outputs[6] + + pixel_decoder_input = self.encoder_to_decoder( + pixel_sequence_output + ) # [batch_size, num_masked_pixel_patches, decoder_hidden_size] + audio_decoder_input = self.encoder_to_decoder( + audio_sequence_output + ) # [batch_size, num_masked_audio_patches, decoder_hidden_size] + num_frames = pixel_values.size(1) + pixel_decoder_input = self.concatenate_mask(self.pixel_mask_token, pixel_decoder_input, pixel_ids_restore) + pixel_decoder_input = pixel_decoder_input + self.decoder_pixel_pos_embed.repeat(1, num_frames, 1) + pixel_decoder_input = pixel_decoder_input + torch.repeat_interleave( + self.decoder_temporal_embed[:, :num_frames], self.num_patches_per_image, dim=1 + ) + pixel_decoder_input = pixel_decoder_input + self.decoder_pixel_type_embed + pixel_decoder_outputs = self.decoder(pixel_decoder_input) + pixel_logits = self.pixel_mae_head(pixel_decoder_outputs.logits) + + audio_decoder_input = self.concatenate_mask(self.audio_mask_token, audio_decoder_input, audio_ids_restore) + num_time_patches = audio_decoder_input.size(1) // self.num_freq_patches + audio_decoder_input = audio_decoder_input + self.decoder_freq_embed.repeat(1, num_time_patches, 1) + audio_decoder_input = audio_decoder_input + torch.repeat_interleave( + self.decoder_audio_pos_embed[:, :num_time_patches], self.num_freq_patches, dim=1 + ) + audio_decoder_input = audio_decoder_input + self.decoder_audio_type_embed + audio_decoder_outputs = self.decoder(audio_decoder_input) + audio_logits = self.audio_mae_head(audio_decoder_outputs.logits) + + loss = self.pixel_mae_loss(pixel_values, pixel_logits, pixel_label_masks) + self.audio_mae_loss( + audio_values, audio_logits, audio_label_masks + ) + total_loss += loss + + if not return_dict: + output = (matching_logits, pixel_logits, audio_logits) + outputs[7:] + return ((total_loss,) + output) if loss is not None else output + + return TvltForPreTrainingOutput( + loss=total_loss, + matching_logits=matching_logits, + pixel_logits=pixel_logits, + audio_logits=audio_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class TvltPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class TvltMatchingHead(nn.Module): + def __init__(self, config): + super().__init__() + self.pooler = TvltPooler(config) + self.fc = nn.Linear(config.hidden_size, 1) + + def forward(self, hidden_states): + hidden_states = self.fc(self.pooler(hidden_states)) + return hidden_states + + +class TvltMAEHead(nn.Module): + def __init__(self, config, output_dim=None): + super().__init__() + self.config = config + self.decoder = nn.Linear(config.decoder_hidden_size, output_dim) + + def forward(self, hidden_states): + hidden_states = self.decoder(hidden_states) + return hidden_states + + +@add_start_docstrings( + """ + Tvlt Model transformer with a classifier head on top (an MLP on top of the final hidden state of the [CLS] token) + for audiovisual classification tasks, e.g. CMU-MOSEI Sentiment Analysis and Audio to Video Retrieval. + """, + TVLT_START_DOCSTRING, +) +class TvltForAudioVisualClassification(TvltPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.tvlt = TvltModel(config) + + # Classifier head + self.classifier = nn.Sequential( + nn.Linear(config.hidden_size, config.hidden_size * 2), + nn.LayerNorm(config.hidden_size * 2, eps=config.layer_norm_eps), + nn.GELU(), + nn.Linear(config.hidden_size * 2, config.num_labels), + ) + self.config = config + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + audio_values: torch.FloatTensor, + pixel_mask: Optional[torch.FloatTensor] = None, + audio_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[tuple[torch.FloatTensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*): + Labels for computing the audiovisual loss. Indices should be in `[0, ..., num_classes-1]` where num_classes + refers to the number of classes in audiovisual tasks. + + Return: + + Examples: + ```python + >>> from transformers import TvltProcessor, TvltForAudioVisualClassification + >>> import numpy as np + >>> import torch + + >>> num_frames = 8 + >>> images = list(np.random.randn(num_frames, 3, 224, 224)) + >>> audio = list(np.random.randn(10000)) + >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base") + >>> model = TvltForAudioVisualClassification.from_pretrained("ZinengTang/tvlt-base") + >>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt") + + >>> outputs = model(**input_dict) + >>> loss = outputs.loss + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.tvlt( + pixel_values, + audio_values, + pixel_mask=pixel_mask, + audio_mask=audio_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0][:, 0] + logits = self.classifier(sequence_output) # rank value + + loss = None + if labels is not None: + if self.config.loss_type == "regression": + loss_fct = MSELoss() + loss = loss_fct(logits, labels) + elif self.config.loss_type == "classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[4:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["TvltModel", "TvltForPreTraining", "TvltForAudioVisualClassification", "TvltPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/processing_tvlt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/processing_tvlt.py new file mode 100644 index 0000000000000000000000000000000000000000..1ecbac0dfa16f7c2cf70d07775bba2beeb456973 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/processing_tvlt.py @@ -0,0 +1,86 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for TVLT. +""" + +from ....processing_utils import ProcessorMixin + + +class TvltProcessor(ProcessorMixin): + r""" + Constructs a TVLT processor which wraps a TVLT image processor and TVLT feature extractor into a single processor. + + [`TvltProcessor`] offers all the functionalities of [`TvltImageProcessor`] and [`TvltFeatureExtractor`]. See the + docstring of [`~TvltProcessor.__call__`] for more information. + + Args: + image_processor (`TvltImageProcessor`): + An instance of [`TvltImageProcessor`]. The image processor is a required input. + feature_extractor (`TvltFeatureExtractor`): + An instance of [`TvltFeatureExtractor`]. The feature extractor is a required input. + """ + + attributes = ["image_processor", "feature_extractor"] + image_processor_class = "TvltImageProcessor" + feature_extractor_class = "TvltFeatureExtractor" + + def __init__(self, image_processor, feature_extractor): + super().__init__(image_processor=image_processor, feature_extractor=feature_extractor) + + self.image_processor = image_processor + self.feature_extractor = feature_extractor + + def __call__( + self, + images=None, + audio=None, + images_mixed=None, + sampling_rate=None, + mask_audio=False, + mask_pixel=False, + *args, + **kwargs, + ): + """ + Forwards the `images` argument to TvltImageProcessor's [`~TvltImageProcessor.preprocess`] and the `audio` + argument to TvltFeatureExtractor's [`~TvltFeatureExtractor.__call__`]. Please refer to the docstring of the + above two methods for more information. + """ + + if images is None and audio is None: + raise ValueError("You need to specify either an `images` or `audio` input to process.") + + images_mixed_dict = None + if images is not None: + images_dict = self.image_processor(images, mask_pixel=mask_pixel, *args, **kwargs) + if images_mixed is not None: + images_mixed_dict = self.image_processor(images_mixed, is_mixed=True, *args, **kwargs) + if audio is not None: + audio_dict = self.feature_extractor( + audio, *args, sampling_rate=sampling_rate, mask_audio=mask_audio, **kwargs + ) + + output_dict = {} + if audio is not None: + output_dict.update(audio_dict) + if images is not None: + output_dict.update(images_dict) + if images_mixed_dict is not None: + output_dict.update(images_mixed_dict) + return output_dict + + +__all__ = ["TvltProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9552c827365d3913539be3eafaf822146ada5829 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_van import * + from .modeling_van import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/configuration_van.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/configuration_van.py new file mode 100644 index 0000000000000000000000000000000000000000..be94e45ec63d9516584c320c98e0b1157c092c2c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/configuration_van.py @@ -0,0 +1,110 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VAN model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class VanConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`VanModel`]. It is used to instantiate a VAN model + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the VAN + [Visual-Attention-Network/van-base](https://huggingface.co/Visual-Attention-Network/van-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + patch_sizes (`list[int]`, *optional*, defaults to `[7, 3, 3, 3]`): + Patch size to use in each stage's embedding layer. + strides (`list[int]`, *optional*, defaults to `[4, 2, 2, 2]`): + Stride size to use in each stage's embedding layer to downsample the input. + hidden_sizes (`list[int]`, *optional*, defaults to `[64, 128, 320, 512]`): + Dimensionality (hidden size) at each stage. + depths (`list[int]`, *optional*, defaults to `[3, 3, 12, 3]`): + Depth (number of layers) for each stage. + mlp_ratios (`list[int]`, *optional*, defaults to `[8, 8, 4, 4]`): + The expansion ratio for mlp layer at each stage. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in each layer. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + layer_scale_init_value (`float`, *optional*, defaults to 0.01): + The initial value for layer scaling. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The dropout probability for stochastic depth. + dropout_rate (`float`, *optional*, defaults to 0.0): + The dropout probability for dropout. + + Example: + ```python + >>> from transformers import VanModel, VanConfig + + >>> # Initializing a VAN van-base style configuration + >>> configuration = VanConfig() + >>> # Initializing a model from the van-base style configuration + >>> model = VanModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "van" + + def __init__( + self, + image_size=224, + num_channels=3, + patch_sizes=[7, 3, 3, 3], + strides=[4, 2, 2, 2], + hidden_sizes=[64, 128, 320, 512], + depths=[3, 3, 12, 3], + mlp_ratios=[8, 8, 4, 4], + hidden_act="gelu", + initializer_range=0.02, + layer_norm_eps=1e-6, + layer_scale_init_value=1e-2, + drop_path_rate=0.0, + dropout_rate=0.0, + **kwargs, + ): + super().__init__(**kwargs) + self.image_size = image_size + self.num_channels = num_channels + self.patch_sizes = patch_sizes + self.strides = strides + self.hidden_sizes = hidden_sizes + self.depths = depths + self.mlp_ratios = mlp_ratios + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.layer_scale_init_value = layer_scale_init_value + self.drop_path_rate = drop_path_rate + self.dropout_rate = dropout_rate + + +__all__ = ["VanConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/modeling_van.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/modeling_van.py new file mode 100644 index 0000000000000000000000000000000000000000..c0fc0bc1a63728ea54935775b085dd8041e46a45 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/modeling_van.py @@ -0,0 +1,520 @@ +# coding=utf-8 +# Copyright 2022 BNRist (Tsinghua University), TKLNDST (Nankai University) and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Visual Attention Network (VAN) model.""" + +import math +from collections import OrderedDict +from typing import Optional, Union + +import torch +from torch import nn + +from ....activations import ACT2FN +from ....modeling_outputs import ( + BaseModelOutputWithNoAttention, + BaseModelOutputWithPoolingAndNoAttention, + ImageClassifierOutputWithNoAttention, +) +from ....modeling_utils import PreTrainedModel +from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_van import VanConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "VanConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "Visual-Attention-Network/van-base" +_EXPECTED_OUTPUT_SHAPE = [1, 512, 7, 7] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "Visual-Attention-Network/van-base" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class VanDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class VanOverlappingPatchEmbedder(nn.Module): + """ + Downsamples the input using a patchify operation with a `stride` of 4 by default making adjacent windows overlap by + half of the area. From [PVTv2: Improved Baselines with Pyramid Vision + Transformer](https://huggingface.co/papers/2106.13797). + """ + + def __init__(self, in_channels: int, hidden_size: int, patch_size: int = 7, stride: int = 4): + super().__init__() + self.convolution = nn.Conv2d( + in_channels, hidden_size, kernel_size=patch_size, stride=stride, padding=patch_size // 2 + ) + self.normalization = nn.BatchNorm2d(hidden_size) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + hidden_state = self.convolution(input) + hidden_state = self.normalization(hidden_state) + return hidden_state + + +class VanMlpLayer(nn.Module): + """ + MLP with depth-wise convolution, from [PVTv2: Improved Baselines with Pyramid Vision + Transformer](https://huggingface.co/papers/2106.13797). + """ + + def __init__( + self, + in_channels: int, + hidden_size: int, + out_channels: int, + hidden_act: str = "gelu", + dropout_rate: float = 0.5, + ): + super().__init__() + self.in_dense = nn.Conv2d(in_channels, hidden_size, kernel_size=1) + self.depth_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=3, padding=1, groups=hidden_size) + self.activation = ACT2FN[hidden_act] + self.dropout1 = nn.Dropout(dropout_rate) + self.out_dense = nn.Conv2d(hidden_size, out_channels, kernel_size=1) + self.dropout2 = nn.Dropout(dropout_rate) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.in_dense(hidden_state) + hidden_state = self.depth_wise(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.dropout1(hidden_state) + hidden_state = self.out_dense(hidden_state) + hidden_state = self.dropout2(hidden_state) + return hidden_state + + +class VanLargeKernelAttention(nn.Module): + """ + Basic Large Kernel Attention (LKA). + """ + + def __init__(self, hidden_size: int): + super().__init__() + self.depth_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=5, padding=2, groups=hidden_size) + self.depth_wise_dilated = nn.Conv2d( + hidden_size, hidden_size, kernel_size=7, dilation=3, padding=9, groups=hidden_size + ) + self.point_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=1) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.depth_wise(hidden_state) + hidden_state = self.depth_wise_dilated(hidden_state) + hidden_state = self.point_wise(hidden_state) + return hidden_state + + +class VanLargeKernelAttentionLayer(nn.Module): + """ + Computes attention using Large Kernel Attention (LKA) and attends the input. + """ + + def __init__(self, hidden_size: int): + super().__init__() + self.attention = VanLargeKernelAttention(hidden_size) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + attention = self.attention(hidden_state) + attended = hidden_state * attention + return attended + + +class VanSpatialAttentionLayer(nn.Module): + """ + Van spatial attention layer composed by projection (via conv) -> act -> Large Kernel Attention (LKA) attention -> + projection (via conv) + residual connection. + """ + + def __init__(self, hidden_size: int, hidden_act: str = "gelu"): + super().__init__() + self.pre_projection = nn.Sequential( + OrderedDict( + [ + ("conv", nn.Conv2d(hidden_size, hidden_size, kernel_size=1)), + ("act", ACT2FN[hidden_act]), + ] + ) + ) + self.attention_layer = VanLargeKernelAttentionLayer(hidden_size) + self.post_projection = nn.Conv2d(hidden_size, hidden_size, kernel_size=1) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.pre_projection(hidden_state) + hidden_state = self.attention_layer(hidden_state) + hidden_state = self.post_projection(hidden_state) + hidden_state = hidden_state + residual + return hidden_state + + +class VanLayerScaling(nn.Module): + """ + Scales the inputs by a learnable parameter initialized by `initial_value`. + """ + + def __init__(self, hidden_size: int, initial_value: float = 1e-2): + super().__init__() + self.weight = nn.Parameter(initial_value * torch.ones(hidden_size), requires_grad=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + # unsqueezing for broadcasting + hidden_state = self.weight.unsqueeze(-1).unsqueeze(-1) * hidden_state + return hidden_state + + +class VanLayer(nn.Module): + """ + Van layer composed by normalization layers, large kernel attention (LKA) and a multi layer perceptron (MLP). + """ + + def __init__( + self, + config: VanConfig, + hidden_size: int, + mlp_ratio: int = 4, + drop_path_rate: float = 0.5, + ): + super().__init__() + self.drop_path = VanDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.pre_normomalization = nn.BatchNorm2d(hidden_size) + self.attention = VanSpatialAttentionLayer(hidden_size, config.hidden_act) + self.attention_scaling = VanLayerScaling(hidden_size, config.layer_scale_init_value) + self.post_normalization = nn.BatchNorm2d(hidden_size) + self.mlp = VanMlpLayer( + hidden_size, hidden_size * mlp_ratio, hidden_size, config.hidden_act, config.dropout_rate + ) + self.mlp_scaling = VanLayerScaling(hidden_size, config.layer_scale_init_value) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + # attention + hidden_state = self.pre_normomalization(hidden_state) + hidden_state = self.attention(hidden_state) + hidden_state = self.attention_scaling(hidden_state) + hidden_state = self.drop_path(hidden_state) + # residual connection + hidden_state = residual + hidden_state + residual = hidden_state + # mlp + hidden_state = self.post_normalization(hidden_state) + hidden_state = self.mlp(hidden_state) + hidden_state = self.mlp_scaling(hidden_state) + hidden_state = self.drop_path(hidden_state) + # residual connection + hidden_state = residual + hidden_state + return hidden_state + + +class VanStage(nn.Module): + """ + VanStage, consisting of multiple layers. + """ + + def __init__( + self, + config: VanConfig, + in_channels: int, + hidden_size: int, + patch_size: int, + stride: int, + depth: int, + mlp_ratio: int = 4, + drop_path_rate: float = 0.0, + ): + super().__init__() + self.embeddings = VanOverlappingPatchEmbedder(in_channels, hidden_size, patch_size, stride) + self.layers = nn.Sequential( + *[ + VanLayer( + config, + hidden_size, + mlp_ratio=mlp_ratio, + drop_path_rate=drop_path_rate, + ) + for _ in range(depth) + ] + ) + self.normalization = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.embeddings(hidden_state) + hidden_state = self.layers(hidden_state) + # rearrange b c h w -> b (h w) c + batch_size, hidden_size, height, width = hidden_state.shape + hidden_state = hidden_state.flatten(2).transpose(1, 2) + hidden_state = self.normalization(hidden_state) + # rearrange b (h w) c- > b c h w + hidden_state = hidden_state.view(batch_size, height, width, hidden_size).permute(0, 3, 1, 2) + return hidden_state + + +class VanEncoder(nn.Module): + """ + VanEncoder, consisting of multiple stages. + """ + + def __init__(self, config: VanConfig): + super().__init__() + self.stages = nn.ModuleList([]) + patch_sizes = config.patch_sizes + strides = config.strides + hidden_sizes = config.hidden_sizes + depths = config.depths + mlp_ratios = config.mlp_ratios + drop_path_rates = [ + x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu") + ] + + for num_stage, (patch_size, stride, hidden_size, depth, mlp_expansion, drop_path_rate) in enumerate( + zip(patch_sizes, strides, hidden_sizes, depths, mlp_ratios, drop_path_rates) + ): + is_first_stage = num_stage == 0 + in_channels = hidden_sizes[num_stage - 1] + if is_first_stage: + in_channels = config.num_channels + self.stages.append( + VanStage( + config, + in_channels, + hidden_size, + patch_size=patch_size, + stride=stride, + depth=depth, + mlp_ratio=mlp_expansion, + drop_path_rate=drop_path_rate, + ) + ) + + def forward( + self, + hidden_state: torch.Tensor, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple, BaseModelOutputWithNoAttention]: + all_hidden_states = () if output_hidden_states else None + + for _, stage_module in enumerate(self.stages): + hidden_state = stage_module(hidden_state) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + if not return_dict: + return tuple(v for v in [hidden_state, all_hidden_states] if v is not None) + + return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=all_hidden_states) + + +class VanPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: VanConfig + base_model_prefix = "van" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + nn.init.trunc_normal_(module.weight, std=self.config.initializer_range) + if isinstance(module, nn.Linear) and module.bias is not None: + nn.init.constant_(module.bias, 0) + elif isinstance(module, nn.LayerNorm): + nn.init.constant_(module.bias, 0) + nn.init.constant_(module.weight, 1.0) + elif isinstance(module, nn.Conv2d): + fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels + fan_out //= module.groups + module.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if module.bias is not None: + module.bias.data.zero_() + + +VAN_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`VanConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VAN_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`ConvNextImageProcessor.__call__`] for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all stages. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare VAN model outputting raw features without any specific head on top. Note, VAN does not have an embedding" + " layer.", + VAN_START_DOCSTRING, +) +class VanModel(VanPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.encoder = VanEncoder(config) + # final layernorm layer + self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps) + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndNoAttention, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.FloatTensor], + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_outputs = self.encoder( + pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + last_hidden_state = encoder_outputs[0] + # global average pooling, n c w h -> n c + pooled_output = last_hidden_state.mean(dim=[-2, -1]) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +@add_start_docstrings( + """ + VAN Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """, + VAN_START_DOCSTRING, +) +class VanForImageClassification(VanPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.van = VanModel(config) + # Classifier head + self.classifier = ( + nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutputWithNoAttention, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutputWithNoAttention]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.van(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + + +__all__ = ["VanForImageClassification", "VanModel", "VanPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f5bd93aa4dabea9b2adbc54eeeb3664de589f43a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_vit_hybrid import * + from .image_processing_vit_hybrid import * + from .modeling_vit_hybrid import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/configuration_vit_hybrid.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/configuration_vit_hybrid.py new file mode 100644 index 0000000000000000000000000000000000000000..e96c6c4a1b58cd40e86dfac71dfcb9c0eb19f57c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/configuration_vit_hybrid.py @@ -0,0 +1,180 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ViT Hybrid model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging +from ...auto.configuration_auto import CONFIG_MAPPING +from ...bit import BitConfig + + +logger = logging.get_logger(__name__) + + +class ViTHybridConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ViTHybridModel`]. It is used to instantiate a ViT + Hybrid model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the ViT Hybrid + [google/vit-hybrid-base-bit-384](https://huggingface.co/google/vit-hybrid-base-bit-384) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + backbone_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*): + The configuration of the backbone in a dictionary or the config object of the backbone. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, defaults to `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 1): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + backbone_featmap_shape (`list[int]`, *optional*, defaults to `[1, 1024, 24, 24]`): + Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + + Example: + + ```python + >>> from transformers import ViTHybridConfig, ViTHybridModel + + >>> # Initializing a ViT Hybrid vit-hybrid-base-bit-384 style configuration + >>> configuration = ViTHybridConfig() + + >>> # Initializing a model (with random weights) from the vit-hybrid-base-bit-384 style configuration + >>> model = ViTHybridModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vit-hybrid" + + def __init__( + self, + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + use_timm_backbone=False, + backbone_kwargs=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=224, + patch_size=1, + num_channels=3, + backbone_featmap_shape=[1, 1024, 24, 24], + qkv_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + if use_pretrained_backbone: + raise ValueError("Pretrained backbones are not supported yet.") + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: + logger.info("`backbone_config` is `None`. Initializing the config with a `BiT` backbone.") + backbone_config = { + "global_padding": "same", + "layer_type": "bottleneck", + "depths": [3, 4, 9], + "out_features": ["stage3"], + "embedding_dynamic_padding": True, + } + + if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: + raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") + + if isinstance(backbone_config, dict): + if "model_type" in backbone_config: + backbone_config_class = CONFIG_MAPPING[backbone_config["model_type"]] + else: + logger.info( + "`model_type` is not found in `backbone_config`. Use `Bit` as the backbone configuration class." + ) + backbone_config_class = BitConfig + backbone_config = backbone_config_class(**backbone_config) + + self.backbone_featmap_shape = backbone_featmap_shape + self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + + @property + def sub_configs(self): + return ( + {"backbone_config": type(self.backbone_config)} + if getattr(self, "backbone_config", None) is not None + else {} + ) + + +__all__ = ["ViTHybridConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py new file mode 100644 index 0000000000000000000000000000000000000000..92d518363b2c34b5b30fd04dc0dbe982c8551e3d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py @@ -0,0 +1,341 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for ViT hybrid.""" + +from typing import Optional, Union + +import numpy as np + +from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ....image_transforms import ( + convert_to_rgb, + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ....image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_kwargs, + validate_preprocess_arguments, +) +from ....utils import TensorType, is_vision_available, logging + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +class ViTHybridImageProcessor(BaseImageProcessor): + r""" + Constructs a ViT Hybrid image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize: + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Optional[dict[str, int]] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + self._valid_processor_keys = [ + "images", + "do_resize", + "size", + "resample", + "do_center_crop", + "crop_size", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_convert_rgb", + "return_tensors", + "data_format", + "input_data_format", + ] + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + default_to_square = True + if "shortest_edge" in size: + size = size["shortest_edge"] + default_to_square = False + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.") + + output_size = get_resize_output_image_size( + image, + size=size, + default_to_square=default_to_square, + input_data_format=input_data_format, + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = make_flat_list_of_images(images) + + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + all_images = [] + for image in images: + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + all_images.append(image) + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["ViTHybridImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py new file mode 100644 index 0000000000000000000000000000000000000000..86b1594a20c92b8f2f528714d4ea9f36e93b8e4d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py @@ -0,0 +1,740 @@ +# coding=utf-8 +# Copyright 2022 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ViT Hybrid model.""" + +import collections.abc +import math +from typing import Optional, Union + +import torch +from torch import nn + +from ....activations import ACT2FN +from ....modeling_layers import GradientCheckpointingLayer +from ....modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ....modeling_utils import PreTrainedModel +from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ....utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + torch_int, +) +from ....utils.backbone_utils import load_backbone +from .configuration_vit_hybrid import ViTHybridConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "ViTHybridConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "google/vit-hybrid-base-bit-384" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "google/vit-hybrid-base-bit-384" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + + +class ViTHybridEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. + """ + + def __init__(self, config: ViTHybridConfig, use_mask_token: bool = False) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None + self.patch_embeddings = ViTHybridPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + if bool_masked_pos is not None: + seq_length = embeddings.shape[1] + mask_tokens = self.mask_token.expand(batch_size, seq_length, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +class ViTHybridPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config, feature_size=None): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + + self.backbone = load_backbone(config) + if self.backbone.config.model_type != "bit": + raise ValueError(f"Backbone model type {self.backbone.model_type} is not supported.") + feature_dim = self.backbone.channels[-1] + + if feature_size is None: + feature_map = config.backbone_featmap_shape + + feature_size = feature_map[-2:] + feature_dim = feature_map[1] + else: + feature_size = ( + feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size) + ) + feature_dim = self.backbone.channels[-1] + + self.grid_size = (feature_size[0] // patch_size[0], feature_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + + self.projection = nn.Conv2d(feature_dim, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + _, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." + ) + + features = self.backbone(pixel_values).feature_maps[-1] + embeddings = self.projection(features).flatten(2).transpose(1, 2) + + return embeddings + + +class ViTHybridSelfAttention(nn.Module): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class ViTHybridSdpaSelfAttention(ViTHybridSelfAttention): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__(config) + self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + head_mask, + self.attention_probs_dropout_prob if self.training else 0.0, + is_causal=False, + scale=None, + ) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, None + + +class ViTHybridSelfOutput(nn.Module): + """ + The residual connection is defined in ViTHybridLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class ViTHybridAttention(nn.Module): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + self.attention = ViTHybridSelfAttention(config) + self.output = ViTHybridSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class ViTHybridSdpaAttention(ViTHybridAttention): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__(config) + self.attention = ViTHybridSdpaSelfAttention(config) + + +class ViTHybridIntermediate(nn.Module): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +class ViTHybridOutput(nn.Module): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +VIT_HYBRID_ATTENTION_CLASSES = { + "eager": ViTHybridAttention, + "sdpa": ViTHybridSdpaAttention, +} + + +class ViTHybridLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = VIT_HYBRID_ATTENTION_CLASSES[config._attn_implementation](config) + self.intermediate = ViTHybridIntermediate(config) + self.output = ViTHybridOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViTHybrid, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + # We assign to correct device for `accelerate`, check: https://github.com/huggingface/transformers/pull/20705/ + hidden_states = attention_output + hidden_states.to(attention_output.device) + + # in ViTHybrid, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +class ViTHybridEncoder(nn.Module): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([ViTHybridLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class ViTHybridPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: ViTHybridConfig + base_model_prefix = "vit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["ViTHybridEmbeddings", "ViTHybridLayer"] + _supports_sdpa = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, ViTHybridEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + module.mask_token.data.zero_() + + +VIT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`ViTHybridConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VIT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`ViTHybridImageProcessor.__call__`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare ViT Hybrid Model transformer outputting raw hidden-states without any specific head on top.", + VIT_START_DOCSTRING, +) +class ViTHybridModel(ViTHybridPreTrainedModel): + def __init__(self, config: ViTHybridConfig, add_pooling_layer: bool = True, use_mask_token: bool = False): + super().__init__(config) + self.config = config + + self.embeddings = ViTHybridEmbeddings(config, use_mask_token=use_mask_token) + self.encoder = ViTHybridEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = ViTHybridPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> ViTHybridPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?) + expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype + if pixel_values.dtype != expected_dtype: + pixel_values = pixel_values.to(expected_dtype) + + embedding_output = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class ViTHybridPooler(nn.Module): + def __init__(self, config: ViTHybridConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@add_start_docstrings( + """ + ViT Hybrid Model transformer with an image classification head on top (a linear layer on top of the final hidden + state of the [CLS] token) e.g. for ImageNet. + """, + VIT_START_DOCSTRING, +) +class ViTHybridForImageClassification(ViTHybridPreTrainedModel): + def __init__(self, config: ViTHybridConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.vit = ViTHybridModel(config, add_pooling_layer=False) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.vit( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output[:, 0, :]) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["ViTHybridForImageClassification", "ViTHybridModel", "ViTHybridPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c13c67012fa157a94bae21734a1f59b26c78588d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_xlm_prophetnet import * + from .modeling_xlm_prophetnet import * + from .tokenization_xlm_prophetnet import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/configuration_xlm_prophetnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/configuration_xlm_prophetnet.py new file mode 100644 index 0000000000000000000000000000000000000000..59f42577c59f19d9d379eaaec15029eaed6967c3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/configuration_xlm_prophetnet.py @@ -0,0 +1,181 @@ +# coding=utf-8 +# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""XLM-ProphetNet model configuration""" + +from typing import Callable, Optional, Union + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class XLMProphetNetConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`XLMProphetNetModel`]. It is used to instantiate a + XLMProphetNet model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the XLMProphetNet + [microsoft/xprophetnet-large-wiki100-cased](https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + activation_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for activations inside the fully connected layer. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`XLMProphetNetModel`]. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the layers and the pooler layer. + encoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + num_encoder_layers (`int`, *optional*, defaults to 12): + Number of encoder layers. + num_encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the `intermediate` (often named feed-forward) layer in decoder. + num_decoder_layers (`int`, *optional*, defaults to 12): + Number of decoder layers. + num_decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + add_cross_attention (`bool`, *optional*, defaults to `True`): + Whether cross-attention layers should be added to the model. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether this is an encoder/decoder model. + pad_token_id (`int`, *optional*, defaults to 1) + Padding token id. + bos_token_id (`int`, *optional*, defaults to 0) + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2) + End of stream token id. + ngram (`int`, *optional*, defaults to 2) + Number of future tokens to predict. Set to 1 to be same as traditional Language model to predict next first + token. + num_buckets (`int`, *optional*, defaults to 32) + The number of buckets to use for each attention layer. This is for relative position calculation. See the + [T5 paper](see https://huggingface.co/papers/1910.10683) for more details. + relative_max_distance (`int`, *optional*, defaults to 128) + Relative distances greater than this number will be put into the last same bucket. This is for relative + position calculation. See the [T5 paper](see https://huggingface.co/papers/1910.10683) for more details. + disable_ngram_loss (`bool`, *optional*, defaults to `False`): + Whether be trained predicting only the next first token. + eps (`float`, *optional*, defaults to 0.0): + Controls the `epsilon` parameter value for label smoothing in the loss calculation. If set to 0, no label + smoothing is performed. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + """ + + model_type = "xlm-prophetnet" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "num_attention_heads": "num_encoder_attention_heads", + } + + def __init__( + self, + activation_dropout: Optional[float] = 0.1, + activation_function: Optional[Union[str, Callable]] = "gelu", + vocab_size: Optional[int] = 30522, + hidden_size: Optional[int] = 1024, + encoder_ffn_dim: Optional[int] = 4096, + num_encoder_layers: Optional[int] = 12, + num_encoder_attention_heads: Optional[int] = 16, + decoder_ffn_dim: Optional[int] = 4096, + num_decoder_layers: Optional[int] = 12, + num_decoder_attention_heads: Optional[int] = 16, + attention_dropout: Optional[float] = 0.1, + dropout: Optional[float] = 0.1, + max_position_embeddings: Optional[int] = 512, + init_std: Optional[float] = 0.02, + is_encoder_decoder: Optional[bool] = True, + add_cross_attention: Optional[bool] = True, + decoder_start_token_id: Optional[int] = 0, + ngram: Optional[int] = 2, + num_buckets: Optional[int] = 32, + relative_max_distance: Optional[int] = 128, + disable_ngram_loss: Optional[bool] = False, + eps: Optional[float] = 0.0, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.encoder_ffn_dim = encoder_ffn_dim + self.num_encoder_layers = num_encoder_layers + self.num_encoder_attention_heads = num_encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.num_decoder_layers = num_decoder_layers + self.num_decoder_attention_heads = num_decoder_attention_heads + self.max_position_embeddings = max_position_embeddings + self.init_std = init_std # Normal(0, this parameter) + self.activation_function = activation_function + + # parameters for xlmprophetnet + self.ngram = ngram + self.num_buckets = num_buckets + self.relative_max_distance = relative_max_distance + self.disable_ngram_loss = disable_ngram_loss + self.eps = eps + + # 3 Types of Dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.dropout = dropout + + self.use_cache = use_cache + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + add_cross_attention=add_cross_attention, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) + + @property + def num_hidden_layers(self) -> int: + return self.num_encoder_layers + self.num_decoder_layers + + @num_hidden_layers.setter + def num_hidden_layers(self, value): + raise NotImplementedError( + "This model does not support the setting of `num_hidden_layers`. Please set `num_encoder_layers` and" + " `num_decoder_layers`." + ) + + +__all__ = ["XLMProphetNetConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py new file mode 100644 index 0000000000000000000000000000000000000000..36f6e6097bc3e55a153b991df9161b01c09c153a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py @@ -0,0 +1,2305 @@ +# coding=utf-8 +# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch XLM-ProphetNet model.""" + +import copy +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import Tensor, nn +from torch.nn import LayerNorm + +from ....activations import ACT2FN +from ....cache_utils import Cache +from ....modeling_layers import GradientCheckpointingLayer +from ....modeling_outputs import BaseModelOutput +from ....modeling_utils import PreTrainedModel +from ....utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ....utils.deprecation import deprecate_kwarg +from .configuration_xlm_prophetnet import XLMProphetNetConfig + + +logger = logging.get_logger(__name__) + + +_CONFIG_FOR_DOC = "XLMProphetNetConfig" + + +XLM_PROPHETNET_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + Original ProphetNet code can be found [here](https://github.com/microsoft/ProphetNet). Checkpoints were converted + from original Fairseq checkpoints. For more information on the checkpoint conversion, please take a look at the + file `convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py`. + + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and + behavior. + + Parameters: + config ([`XLMProphetNetConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +XLM_PROPHETNET_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + XLMProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +def softmax(hidden_state, dim, onnx_trace=False): + if onnx_trace: + return nn.functional.softmax(hidden_state.float(), dim=dim) + else: + return nn.functional.softmax(hidden_state, dim=dim, dtype=torch.float32) + + +def ngram_attention_bias(sequence_length, ngram, device, dtype): + """ + This function computes the bias for the predict stream + """ + left_block = ( + torch.ones((ngram, sequence_length, sequence_length), device=device, dtype=dtype) * torch.finfo(dtype).min + ) + right_block = left_block.detach().clone() + # create bias + for stream_idx in range(ngram): + right_block[stream_idx].fill_diagonal_(0, wrap=False) + left_block[stream_idx].triu_(-stream_idx + 1) + + left_block[:, :, 0] = 0 + return torch.cat([left_block, right_block], dim=2) + + +def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_bidirectional=False): + """ + This function computes individual parts of the relative position buckets. For more detail, see paper. + """ + inv_relative_positions = -relative_positions + rel_positions_bucket = 0 + + if is_bidirectional: + num_buckets = num_buckets // 2 + rel_positions_bucket = ( + rel_positions_bucket + + torch.lt(inv_relative_positions, torch.zeros_like(inv_relative_positions)).int() * num_buckets + ) + inv_relative_positions = torch.abs(inv_relative_positions) + else: + inv_relative_positions = torch.max(inv_relative_positions, torch.zeros_like(inv_relative_positions)) + + max_exact = num_buckets // 2 + is_small = torch.lt(inv_relative_positions, max_exact) + val_if_large = max_exact + torch.log(inv_relative_positions.float() / max_exact) / math.log( + max_distance / max_exact + ) * (num_buckets - max_exact) + val_if_large = torch.min(val_if_large, torch.ones_like(val_if_large) * (num_buckets - 1)).int() + rel_positions_bucket = rel_positions_bucket + torch.where(is_small, inv_relative_positions.int(), val_if_large) + return rel_positions_bucket + + +def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids): + """ + This function computes both main and predict relative position buckets. For more detail, see paper. + """ + # main stream + main_stream_relative_positions = position_ids.unsqueeze(1).repeat(1, position_ids.size(-1), 1) + main_stream_relative_positions = main_stream_relative_positions - position_ids.unsqueeze(-1) + + # predicting stream + predicting_stream_relative_positions = torch.cat((position_ids - 1, position_ids), dim=-1).unsqueeze(1) + predicting_stream_relative_positions = predicting_stream_relative_positions.repeat(1, position_ids.size(-1), 1) + predicting_stream_relative_positions = predicting_stream_relative_positions - position_ids.unsqueeze(-1) + + # get both position buckets + main_relative_position_buckets = compute_relative_buckets( + num_buckets, max_distance, main_stream_relative_positions, is_bidirectional=False + ) + predict_relative_position_buckets = compute_relative_buckets( + num_buckets, max_distance, predicting_stream_relative_positions, is_bidirectional=False + ) + return main_relative_position_buckets, predict_relative_position_buckets + + +@dataclass +class XLMProphetNetSeq2SeqLMOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`): + Prediction scores of the main stream language modeling head (scores for each vocabulary token before + SoftMax). + logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`): + Prediction scores of the predict stream language modeling head (scores for each vocabulary token before + SoftMax). + past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, + num_attn_heads, decoder_sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be + used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, decoder_sequence_length, hidden_size)`. + + Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs. + decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`. + + Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding + outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + decoder_sequence_length, decoder_sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + decoder_sequence_length, decoder_sequence_length)`. + + Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + encoder_sequence_length, decoder_sequence_length)`. + + Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to + compute the weighted average in the + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, encoder_sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + encoder_sequence_length, encoder_sequence_length)`. Attentions weights of the encoder, after the attention + softmax, used to compute the weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + logits_ngram: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_ngram_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + decoder_ngram_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + + @property + def decoder_cross_attentions(self): + warnings.warn( + "`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`" + " instead.", + FutureWarning, + ) + return self.cross_attentions + + +@dataclass +class XLMProphetNetSeq2SeqModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`): + Sequence of main stream hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size,ngram * decoder_sequence_length, config.vocab_size)`, *optional*): + Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model. + past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, + num_attn_heads, decoder_sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be + used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, decoder_sequence_length, hidden_size)`. + + Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs. + decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`. + + Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding + outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + decoder_sequence_length, decoder_sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + decoder_sequence_length, decoder_sequence_length)`. + + Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the + weighted average in the + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + encoder_sequence_length, decoder_sequence_length)`. + + Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to + compute the weighted average in the + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, encoder_sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + encoder_sequence_length, encoder_sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + last_hidden_state: torch.FloatTensor + last_hidden_state_ngram: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_ngram_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + decoder_ngram_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + + @property + def decoder_cross_attentions(self): + warnings.warn( + "`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`" + " instead.", + FutureWarning, + ) + return self.cross_attentions + + +@dataclass +class XLMProphetNetDecoderModelOutput(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`): + Sequence of main stream hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`): + Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model. + past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, + num_attn_heads, decoder_sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be + used (see `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, decoder_sequence_length, hidden_size)`. + + Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs. + ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`. + + Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding + outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + decoder_sequence_length, decoder_sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + decoder_sequence_length, decoder_sequence_length)`. + + Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the + weighted average in the + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + encoder_sequence_length, decoder_sequence_length)`. + + Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to + compute the weighted average in the + """ + + last_hidden_state: torch.FloatTensor + last_hidden_state_ngram: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + hidden_states_ngram: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + ngram_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +class XLMProphetNetDecoderLMOutput(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`): + Prediction scores of the main stream language modeling head (scores for each vocabulary token before + SoftMax). + logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`): + Prediction scores of the predict stream language modeling head (scores for each vocabulary token before + SoftMax). + past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, + num_attn_heads, decoder_sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be + used (see `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, decoder_sequence_length, hidden_size)`. + + Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs. + ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`. + + Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding + outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + decoder_sequence_length, decoder_sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + decoder_sequence_length, decoder_sequence_length)`. + + Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the + weighted average in the + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads, + encoder_sequence_length, decoder_sequence_length)`. + + Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to + compute the weighted average in the + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + logits_ngram: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + hidden_states_ngram: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + ngram_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + + +class XLMProphetNetPreTrainedModel(PreTrainedModel): + config: XLMProphetNetConfig + base_model_prefix = "prophetnet" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.init_std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.init_std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + assert decoder_start_token_id is not None, ( + "self.model.config.decoder_start_token_id has to be defined. In XLMProphetNet it is usually set to the" + " pad_token_id. See XLMProphetNet docs for more information" + ) + + # shift inputs to the right + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values" + + return shifted_input_ids + + +class XLMProphetNetPositionalEmbeddings(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting + based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to + the forward function. + """ + + def __init__(self, config: XLMProphetNetConfig) -> None: + self.max_length = config.max_position_embeddings + super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id) + + def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None): + assert (position_ids is None) or (self.padding_idx is None), ( + "If position_ids is pre-computed then padding_idx should not be set." + ) + + if position_ids is None: + if past_key_values is not None: + # position_ids is the same for every token when decoding a single step + # Without the int() cast, it doesn't work in some cases when exporting to ONNX + prev_num_input_ids = past_key_values.get_seq_length() + num_input_ids = inputs_shape[1] + prev_num_input_ids + position_ids = torch.ones((1, 1), dtype=torch.long, device=device) * ( + int(self.padding_idx + num_input_ids) + ) + else: + if attention_mask is None: + attention_mask = torch.ones(inputs_shape, dtype=torch.long, device=device) + + # retrieve position_ids from input_ids / attention_mask + position_ids = ( + torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask + ).long() + self.padding_idx + + # make sure position_ids are not bigger then max_length + position_ids = position_ids.clamp(0, self.max_length - 1) + + return super().forward(position_ids), position_ids + + def _forward(self, position_ids): + return super().forward(position_ids) + + +class XLMProphetNetAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + config: XLMProphetNetConfig, + num_attn_heads: int, + ): + super().__init__() + hidden_size = config.hidden_size + + self.attention_dropout = config.attention_dropout + self.dropout = config.dropout + self.num_attn_heads = num_attn_heads + self.head_dim = hidden_size // num_attn_heads + + assert self.head_dim * num_attn_heads == hidden_size, ( + "`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and" + " `config.num_decoder_attention_heads`" + ) + + self.key_proj = nn.Linear(hidden_size, hidden_size) + self.value_proj = nn.Linear(hidden_size, hidden_size) + self.query_proj = nn.Linear(hidden_size, hidden_size) + + self.out_proj = nn.Linear(hidden_size, hidden_size) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous() + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + key_value_states: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + layer_head_mask: Optional[Tensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: bool = False, + ) -> tuple[Tensor, Optional[Tensor]]: + batch_size, tgt_len, hidden_size = hidden_states.size() + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + assert list(hidden_states.size()) == [ + batch_size, + tgt_len, + hidden_size, + ], f"Size of hidden states should be {batch_size, tgt_len, hidden_size}, but is {hidden_states.size()}" + + # previous time steps are cached - no need to recompute key and value if they are static + query_states = self.query_proj(hidden_states) / (self.head_dim**0.5) + + if is_cross_attention and past_key_values is not None: + # reuse k,v, cross_attentions + key_states = past_key_values[0] + value_states = past_key_values[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.key_proj(key_value_states), -1, batch_size) + value_states = self._shape(self.value_proj(key_value_states), -1, batch_size) + else: + # self_attention + key_states = self._shape(self.key_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.value_proj(hidden_states), -1, batch_size) + + if is_cross_attention: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if encoder bi-directional self-attention `past_key_values` is always `None` + past_key_values = (key_states, value_states) + + # project states into the correct shape + proj_shape = (batch_size, self.num_attn_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + src_len = key_states.size(2) + attn_weights = torch.einsum("bsij,bsjk->bsik", query_states, key_states.transpose(2, 3)) + expected_shape = (batch_size, self.num_attn_heads, tgt_len, src_len) + if attn_weights.size() != expected_shape: + raise ValueError(f"Attention weights should have size {expected_shape}, but is {attn_weights.size()}") + + # This is part of a workaround to get around fork/join parallelism not supporting Optional types. + if attention_mask is not None and attention_mask.dim() == 0: + attention_mask = None + + expected_shape = (batch_size, self.num_attn_heads, 1, src_len) + if attention_mask is not None and attention_mask.size() != expected_shape: + raise ValueError(f"Attention mask should have size {expected_shape}, but is {attention_mask.size()}") + if attention_mask is not None: # don't attend to padding symbols + attn_weights = attn_weights + attention_mask + if output_attentions: + attn_weights_reshaped = attn_weights + else: + attn_weights_reshaped = None + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + assert layer_head_mask.size() == (self.num_attn_heads,), ( + f"Head mask for a single layer should be of size {(self.num_attn_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view( + batch_size, self.num_attn_heads, tgt_len, src_len + ) + + # apply head_mask also on attn_weights_reshaped which is used for n-gram attention inside the model + attn_weights_reshaped = layer_head_mask.view(1, -1, 1, 1) * attn_weights_reshaped + + attn_probs = nn.functional.dropout( + attn_weights, + p=self.attention_dropout, + training=self.training, + ) + attn_output = torch.einsum("bsij,bsjk->bsik", attn_probs, value_states) + expected_shape = (batch_size, self.num_attn_heads, tgt_len, self.head_dim) + if attn_output.size() != expected_shape: + raise ValueError(f"`attn_output` should have shape {expected_shape}, but is of shape {attn_output.size()}") + + attn_output = attn_output.transpose(1, 2).reshape(batch_size, tgt_len, hidden_size) + attn_output = self.out_proj(attn_output) + + attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training) + return attn_output, attn_weights_reshaped, past_key_values + + +class XLMProphetNetFeedForward(nn.Module): + """ + This is the residual two feed-forward layer block based on the original Transformer implementation. + """ + + def __init__(self, config: XLMProphetNetConfig, ffn_dim: int): + super().__init__() + self.activation_fn = ACT2FN[config.activation_function] + self.intermediate = nn.Linear(config.hidden_size, ffn_dim) + self.output = nn.Linear(ffn_dim, config.hidden_size) + self.activation_dropout = config.activation_dropout + self.dropout = config.dropout + + def forward(self, hidden_states): + hidden_states = self.intermediate(hidden_states) + hidden_states = self.activation_fn(hidden_states) + + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.output(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + return hidden_states + + +class XLMProphetNetNgramSelfAttention(nn.Module): + def __init__(self, config: XLMProphetNetConfig): + super().__init__() + self.hidden_size = config.hidden_size + + self.num_buckets = config.num_buckets + self.relative_max_distance = config.relative_max_distance + self.num_attn_heads = config.num_decoder_attention_heads + self.dropout = config.dropout + self.attention_dropout = config.attention_dropout + self.head_dim = config.hidden_size // self.num_attn_heads + self.ngram = config.ngram + + assert self.head_dim * self.num_attn_heads == config.hidden_size, ( + "config.hidden_size must be divisible by num_attn_heads" + ) + # key, value, query projection + self.key_proj = nn.Linear(config.hidden_size, config.hidden_size) + self.value_proj = nn.Linear(config.hidden_size, config.hidden_size) + self.query_proj = nn.Linear(config.hidden_size, config.hidden_size) + + # out projection + self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) + + # rel position embeddings + self.relative_pos_embeddings = nn.Linear(config.hidden_size, self.num_buckets * self.num_attn_heads) + + # for onnx runtime + self.onnx_trace = False + + def _shape(self, tensor, seq_len, batch_size): + return tensor.view(batch_size, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous() + + def prepare_for_onnx_export_(self): + self.onnx_trace = True + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + past_key_values: Optional[Cache] = None, + attention_mask=None, + layer_head_mask=None, + extended_predict_attention_mask=None, + main_relative_position_buckets=None, + predict_relative_position_buckets=None, + position_ids=None, + ): + batch_size, ngram_sequence_length, hidden_size = hidden_states.size() + assert list(hidden_states.size()) == [batch_size, ngram_sequence_length, hidden_size], ( + f"`hidden_states` should be of shape {batch_size, ngram_sequence_length, hidden_size}, but is of shape" + f" {hidden_states.shape}" + ) + + # project + query_states = self.query_proj(hidden_states) + key_states = self.key_proj(hidden_states) + value_states = self.value_proj(hidden_states) + + # normalize + query_states = query_states / (self.head_dim**0.5) + + # reshape + query_states = self._shape(query_states, ngram_sequence_length, batch_size) + key_states = self._shape(key_states, -1, batch_size) + value_states = self._shape(value_states, -1, batch_size) + proj_shape = (batch_size, self.num_attn_heads, -1, self.head_dim) + + query_states = query_states.view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + # chunk into main stream and predict stream + hidden_states_list = hidden_states.chunk(1 + self.ngram, dim=1) + query_states_list = query_states.chunk(1 + self.ngram, dim=2) + key_states_list = key_states.chunk(1 + self.ngram, dim=2) + value_states_list = value_states.chunk(1 + self.ngram, dim=2) + + main_hidden_states, hidden_states_predict_list = hidden_states_list[0], hidden_states_list[1:] + main_query_states, predict_query_states_list = query_states_list[0], query_states_list[1:] + main_key_states, predict_key_states_list = key_states_list[0], key_states_list[1:] + main_value_states, predict_value_states_list = value_states_list[0], value_states_list[1:] + + # saved states are stored with shape (batch_size, num_attn_heads, seq_len, head_dim) + if past_key_values is not None: + prev_main_key_states = past_key_values[0] + main_key_states = torch.cat((prev_main_key_states, main_key_states), dim=2) + prev_main_value_states = past_key_values[1] + main_value_states = torch.cat((prev_main_value_states, main_value_states), dim=2) + + # Update cache + past_key_values = (main_key_states, main_value_states) + + # get seq_length of main stream only + sequence_length = ngram_sequence_length // (1 + self.ngram) + + # MAIN-STREAM + # main attn weights + # [batch_size, number_heads, sequence_length, head_dimesion] + # x [batch_size, number_heads, head_dimesion, sequence_length] + # -> [batch_size, number_heads, sequence_length, sequence_length] + main_attn_weights = torch.einsum("bntc,bncs->bnts", main_query_states, main_key_states.transpose(2, 3)) + + # retrieve relative position embeddings for each layer -> see paper for more details + main_relative_pos_embeddings = self.get_main_relative_pos_embeddings( + main_hidden_states, main_attn_weights, position_ids, main_relative_position_buckets + ) + + main_attn_weights = main_attn_weights + main_relative_pos_embeddings + + if attention_mask is not None: + main_attn_weights = main_attn_weights + attention_mask + + main_attn_probs = softmax( + main_attn_weights, + dim=-1, + onnx_trace=self.onnx_trace, + ).type_as(main_attn_weights) + + if layer_head_mask is not None: + assert layer_head_mask.size() == (self.num_attn_heads,), ( + f"Head mask for a single layer should be of size {(self.num_attn_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + main_attn_probs = layer_head_mask.view(1, -1, 1, 1) * main_attn_probs.view( + batch_size, self.num_attn_heads, -1, sequence_length + ) + + main_attn_probs = nn.functional.dropout(main_attn_probs, p=self.attention_dropout, training=self.training) + # project to attn_output + # [batch_size, number_heads, sequence_length, sequence_length] + # x [batch_size, number_heads, sequence_length, head_dimesion] + # -> [batch_size, number_heads, sequence_length, head_dimesion] + main_attn_output = torch.einsum("bntc,bncs->bnts", main_attn_probs, main_value_states) + # reshape so that num_heads dim is merged into last `head_dim` axis + main_attn_output = main_attn_output.transpose(1, 2).reshape(batch_size, 1, sequence_length, hidden_size) + main_attn_output = self.out_proj(main_attn_output) + + # PREDICT-STREAM + # [batch_size, ngram, number_heads, sequence_length, head_dimesion] + predict_query_states = torch.stack(predict_query_states_list, 1).view( + batch_size, self.ngram, self.num_attn_heads, sequence_length, self.head_dim + ) + + # [batch_size, ngram, number_heads, 2*sequence_length, head_dimesion] + predict_key_states = torch.stack([torch.cat([main_key_states, key], 2) for key in predict_key_states_list], 1) + + # [batch_size, sequence_length, ngram, hidden_size] + predict_hidden_states = torch.stack(hidden_states_predict_list, dim=2) + + # [batch_size, number_heads, ngram, 2*sequence_length, head_dimesion] + predict_value_states = torch.cat( + [torch.cat([main_value_states, v_p], 2).unsqueeze(2) for v_p in predict_value_states_list], 2 + ) + + # [batch_size, ngram, number_heads, sequence_length, head_dimesion] + # x [batch_size, ngram, number_heads, 2*sequence_length, head_dimesion] + # -> [batch_size, ngram, number_heads, sequence_length, 2*sequence_length] + predict_attn_weights = torch.einsum("bnhtc,bnhsc->bnhts", (predict_query_states, predict_key_states)) + + # retrieve relative position embeddings for each layer -> see paper for more details + # [batch_size, ngram, number_heads, sequence_length, predict_relative_pos_embeddings] + predict_relative_pos_embeddings = self.get_predict_relative_pos_embeddings( + predict_hidden_states, predict_attn_weights, position_ids, predict_relative_position_buckets + ) + + # [batch_size, ngram, number_heads, sequence_length, 2*sequence_length] + predict_attn_weights = predict_attn_weights + predict_relative_pos_embeddings + + if extended_predict_attention_mask is not None: + # Permuting Predict attention mask to [batch_size, ngram, number_heads, sequence_length, 2*sequence_length] + extended_predict_attention_mask = extended_predict_attention_mask.permute(0, 2, 1, 3, 4) + extended_predict_attention_mask = extended_predict_attention_mask.to(predict_attn_weights.dtype) + predict_attn_weights = predict_attn_weights + extended_predict_attention_mask + + predict_attn_probs = softmax( + predict_attn_weights, + dim=-1, + onnx_trace=self.onnx_trace, + ).type_as(predict_attn_weights) + + if layer_head_mask is not None: + assert layer_head_mask.size() == (self.num_attn_heads,), ( + f"Head mask for a single layer should be of size {(self.num_attn_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + predict_attn_probs = layer_head_mask.view(1, 1, -1, 1, 1) * predict_attn_probs + + predict_attn_probs = nn.functional.dropout( + predict_attn_probs, p=self.attention_dropout, training=self.training + ) + # project to attention output + # [batch_size, ngram, number_heads, sequence_length, 2*sequence_length] + # x [batch_size, ngram, number_heads, 2*sequence_length, head_dimesion] + # -> [batch_size, ngram, number_heads, sequence_length, head_dimesion] + predict_attn_output = torch.einsum( + "bnhts,bnhsc->bnhtc", (predict_attn_probs, predict_value_states.transpose(1, 2)) + ) + + # reshape so that num_heads dim is merged into last `head_dim` axis + # [batch_size, ngram, number_heads, sequence_length, head_dimesion] -> [batch_size, ngram, sequence_length, hidden_size] + predict_attn_output = predict_attn_output.transpose(2, 3) + predict_attn_output = predict_attn_output.reshape(batch_size, self.ngram, sequence_length, hidden_size) + predict_attn_output = self.out_proj(predict_attn_output) + + # concat to single attn output + # [batch_size, (1+ngram)*sequence_length, hidden_size] + attn_output = torch.cat([main_attn_output, predict_attn_output], 1).view(batch_size, -1, hidden_size) + # reshape into better form for `config.output_attentions` + main_attn_probs = main_attn_probs.view(batch_size, self.num_attn_heads, sequence_length, -1) + + attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training) + + return attn_output, main_attn_probs, predict_attn_probs, past_key_values + + def get_main_relative_pos_embeddings( + self, hidden_states, attn_weights, position_ids, main_relative_position_buckets + ): + # input hidden_states [batch_size, sequence_length, hidden_size] + # input attn_weights [batch_size, num_heads, sequence_length, sequence_length] + # input position_ids [batch_size, sequence_length] or [1,1] + batch_size, num_attn_heads, tgt_len, src_len = attn_weights.shape + attn_weights = attn_weights.view(batch_size, num_attn_heads, tgt_len, src_len) + if main_relative_position_buckets is None: + batch_size, sequence_length = hidden_states.shape[:2] + relative_positions = ( + torch.arange(1, attn_weights.shape[-1] + 1) + .unsqueeze(0) + .unsqueeze(0) + .repeat(batch_size, sequence_length, 1) + .to(position_ids.device) + ) + # [batch_size, sequence_length, sequence_length+1] + relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1) + main_relative_position_buckets = compute_relative_buckets( + self.num_buckets, self.relative_max_distance, relative_positions, False + ) + + # [batch_size, sequence_length, num_buckets * num_heads] + rel_pos_embeddings = self.relative_pos_embeddings(hidden_states) + rel_pos_embeddings = rel_pos_embeddings.view( + rel_pos_embeddings.shape[:2] + (self.num_buckets, self.num_attn_heads) + ) + rel_pos_embeddings = rel_pos_embeddings.permute(0, 3, 1, 2) + # [batch_size, num_heads, sequence_length, num_buckets] + rel_pos_embeddings = rel_pos_embeddings.reshape(attn_weights.shape[:3] + (-1,)) + + main_relative_position_buckets = main_relative_position_buckets.repeat(1, self.num_attn_heads, 1) + # [batch_size * num_heads * sequence_length, sequence_length] + main_relative_position_buckets = main_relative_position_buckets.view( + -1, main_relative_position_buckets.shape[-1] + ) + main_relative_position_buckets = main_relative_position_buckets.long() + # [batch_size * num_heads * sequence_length, sequence_length] + rel_pos_embeddings = rel_pos_embeddings.reshape(-1, rel_pos_embeddings.size(-1)) + + main_relative_pos_embeddings = torch.gather(rel_pos_embeddings, dim=1, index=main_relative_position_buckets) + main_relative_pos_embeddings = main_relative_pos_embeddings.view(batch_size, num_attn_heads, tgt_len, -1) + return main_relative_pos_embeddings + + def get_predict_relative_pos_embeddings( + self, hidden_states, attn_weights, position_ids, predict_relative_position_buckets + ): + # input hidden_states [batch_size, sequence_length, ngram, hidden_size] + # input attn_weights [batch_size, ngram, num_heads, sequence_length, 2*sequence_length] + # input position_ids [batch_size, sequence_length] or [1,1] + # input predict_relative_position_buckets [batch_size, sequence_length, 2*sequence_length] or None + batch_size, sequence_length = hidden_states.shape[0:2] + + if predict_relative_position_buckets is None: + key_sequence_length = attn_weights.shape[-1] + assert position_ids[0][0] == key_sequence_length - 1, ( + "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)" + ) + relative_positions = ( + torch.arange(0, key_sequence_length) + .unsqueeze(0) + .unsqueeze(0) + .repeat(batch_size, sequence_length, 1) + .to(position_ids.device) + ) + + relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1) + predict_relative_position_buckets = compute_relative_buckets( + self.num_buckets, self.relative_max_distance, relative_positions, False + ) + + # [batch_size, ngram, sequence_length, hidden_size] + hidden_states = hidden_states.transpose(1, 2) + rel_pos_embeddings = self.relative_pos_embeddings(hidden_states) + + # [batch_size, ngram, sequence_length, num_buckets, num_heads] + rel_pos_embeddings = rel_pos_embeddings.view( + hidden_states.shape[:-1] + (self.num_buckets, self.num_attn_heads) + ) + rel_pos_embeddings = rel_pos_embeddings.permute(0, 2, 1, 4, 3) + # [batch_size * ngram * sequence_length * num_heads, num_buckets] + rel_pos_embeddings = rel_pos_embeddings.reshape(-1, self.num_buckets) + # [ngram, batch_size, num_heads * sequence_length, -1] + predict_relative_position_buckets = predict_relative_position_buckets.unsqueeze(0) + predict_relative_position_buckets = predict_relative_position_buckets.repeat( + self.ngram, 1, self.num_attn_heads, 1 + ) + # [ngram * batch_size * num_heads * sequence_length, -1] + predict_relative_position_buckets = predict_relative_position_buckets.view( + -1, predict_relative_position_buckets.size(-1) + ).long() + + predict_relative_pos_embeddings = torch.gather( + rel_pos_embeddings, dim=1, index=predict_relative_position_buckets + ) + + # [batch_size, gram, num_heads, sequence_length, -1] + predict_relative_pos_embeddings = predict_relative_pos_embeddings.view( + batch_size, self.ngram, self.num_attn_heads, sequence_length, -1 + ) + + return predict_relative_pos_embeddings + + +class XLMProphetNetEncoderLayer(GradientCheckpointingLayer): + """ + Encoder block for XLMProphetnet + """ + + def __init__(self, config: XLMProphetNetConfig): + super().__init__() + # 1st residual block + self.self_attn = XLMProphetNetAttention(config, config.num_encoder_attention_heads) + self.self_attn_layer_norm = LayerNorm(config.hidden_size) + + # 2nd residual block + self.feed_forward = XLMProphetNetFeedForward(config, config.encoder_ffn_dim) + self.feed_forward_layer_norm = LayerNorm(config.hidden_size) + + def forward( + self, + hidden_states, + attention_mask, + layer_head_mask, + output_attentions: bool = False, + ): + # 1st residual block + attention_output, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = self.self_attn_layer_norm(attention_output + hidden_states) + + # 2nd residual block + feed_forward_output = self.feed_forward(hidden_states) + hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class XLMProphetNetDecoderLayer(GradientCheckpointingLayer): + """ + Decoder block for XLMProphetnet + """ + + def __init__(self, config: XLMProphetNetConfig): + super().__init__() + # 1st residual block + self.self_attn = XLMProphetNetNgramSelfAttention(config) + self.self_attn_layer_norm = LayerNorm(config.hidden_size) + + # 2nd residual block + if config.add_cross_attention: + self.cross_attn = XLMProphetNetAttention(config, config.num_decoder_attention_heads) + self.cross_attn_layer_norm = LayerNorm(config.hidden_size) + + # 3rd residual block + self.feed_forward = XLMProphetNetFeedForward(config, config.decoder_ffn_dim) + self.feed_forward_layer_norm = LayerNorm(config.hidden_size) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attn_mask=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + extended_predict_attention_mask=None, + main_relative_position_buckets=None, + predict_relative_position_buckets=None, + position_ids=None, + past_key_values=None, + use_cache: bool = True, + output_attentions: bool = False, + ): + # 1st residual block + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None + ngram_attention_output, self_attn_weights, self_attn_weights_ngram, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_values=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + extended_predict_attention_mask=extended_predict_attention_mask, + main_relative_position_buckets=main_relative_position_buckets, + predict_relative_position_buckets=predict_relative_position_buckets, + position_ids=position_ids, + ) + hidden_states = self.self_attn_layer_norm(hidden_states + ngram_attention_output) + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None + cross_attn_weights = None + if encoder_hidden_states is not None: + # 2nd residual block + attention_output, cross_attn_weights, cross_attn_present_key_value = self.cross_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attn_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_values=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = self.cross_attn_layer_norm(attention_output + hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # 3rd residual block + feed_forward_output = self.feed_forward(hidden_states) + hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, self_attn_weights_ngram, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +@add_start_docstrings( + "The standalone encoder part of the XLMProphetNetModel.", + XLM_PROPHETNET_START_DOCSTRING, +) +class XLMProphetNetEncoder(XLMProphetNetPreTrainedModel): + r""" + word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*): + The word embedding parameters. This can be used to initialize [`XLMProphetNetEncoder`] with pre-defined word + embeddings instead of randomly initialized word embeddings. + """ + + def __init__(self, config: XLMProphetNetConfig, word_embeddings: Optional[nn.Embedding] = None): + super().__init__(config) + + self.word_embeddings = ( + word_embeddings + if word_embeddings is not None + else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + ) + self.position_embeddings = XLMProphetNetPositionalEmbeddings(config) + self.embeddings_layer_norm = LayerNorm(config.hidden_size) + + self.layers = nn.ModuleList([XLMProphetNetEncoderLayer(config) for _ in range(config.num_encoder_layers)]) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.word_embeddings + + def set_input_embeddings(self, value): + self.word_embeddings = value + + @add_start_docstrings_to_model_forward(XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, XLMProphetNetEncoder + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone") + >>> model = XLMProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone") + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None and inputs_embeds is None: + raise ValueError("Either input_ids or inputs_embeds has to be passed.") + elif input_ids is not None and inputs_embeds is not None: + raise ValueError("Make sure to only pass input_ids or inputs_embeds.") + elif input_ids is not None and inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + # prepare attention mask + if attention_mask is not None: + extended_attention_mask = ( + 1.0 - attention_mask[:, None, None, :].repeat(1, self.config.num_encoder_attention_heads, 1, 1) + ) * torch.finfo(self.dtype).min + extended_attention_mask = extended_attention_mask.to(inputs_embeds.dtype) + else: + extended_attention_mask = None + + position_embeddings, position_ids = self.position_embeddings(inputs_embeds.shape[:2], inputs_embeds.device) + + hidden_states = inputs_embeds + position_embeddings + hidden_states = self.embeddings_layer_norm(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.config.dropout, training=self.training) + + encoder_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == (len(self.layers)), ( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." + ) + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_hidden_states = encoder_hidden_states + (hidden_states,) + + layer_outputs = encoder_layer( + hidden_states, + attention_mask=extended_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_hidden_states = encoder_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_hidden_states, attentions=all_attentions + ) + + +@add_start_docstrings( + "The standalone decoder part of the XLMProphetNetModel.", + XLM_PROPHETNET_START_DOCSTRING, +) +class XLMProphetNetDecoder(XLMProphetNetPreTrainedModel): + r""" + word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*): + The word embedding parameters. This can be used to initialize [`XLMProphetNetEncoder`] with pre-defined word + embeddings instead of randomly initialized word embeddings. + """ + + def __init__(self, config: XLMProphetNetConfig, word_embeddings: Optional[nn.Embedding] = None): + super().__init__(config) + + self.ngram = config.ngram + self.num_buckets = config.num_buckets + self.relative_max_distance = config.relative_max_distance + self.dropout = config.dropout + self.max_target_positions = config.max_position_embeddings + + self.word_embeddings = ( + word_embeddings + if word_embeddings is not None + else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + ) + self.position_embeddings = XLMProphetNetPositionalEmbeddings(config) + + self.ngram_embeddings = nn.Embedding(self.ngram, config.hidden_size, None) + self.layers = nn.ModuleList([XLMProphetNetDecoderLayer(config) for _ in range(config.num_decoder_layers)]) + self.embeddings_layer_norm = LayerNorm(config.hidden_size) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.word_embeddings + + def set_input_embeddings(self, value): + self.word_embeddings = value + + @add_start_docstrings_to_model_forward(XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=XLMProphetNetDecoderModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, XLMProphetNetDecoderModelOutput]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, XLMProphetNetDecoder + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone") + >>> model = XLMProphetNetDecoder.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone", add_cross_attention=False) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + ```""" + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None and inputs_embeds is None: + raise ValueError("Either `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.") + elif input_ids is not None and inputs_embeds is not None: + raise ValueError("Make sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.") + elif input_ids is not None and inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + batch_size, sequence_length = inputs_embeds.shape[:2] + + main_stream_pos_embed, position_ids = self.position_embeddings( + (batch_size, sequence_length), + device=inputs_embeds.device, + past_key_values=past_key_values, + ) + + if past_key_values is not None: + main_relative_position_buckets, predict_relative_position_buckets = None, None + else: + ( + main_relative_position_buckets, + predict_relative_position_buckets, + ) = self.compute_buffered_relative_buckets(position_ids) + predicting_stream_pos_embed = self.position_embeddings._forward(position_ids + 1) + + # add position embeddings + hidden_states = inputs_embeds + main_stream_pos_embed + + ngram_embeddings = self.ngram_embeddings.weight + + # prepare attention mask + if past_key_values is not None: + assert hidden_states.size(1) == 1, ( + "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1" + ) + + ngram_hidden_states = [ + (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).repeat(batch_size, 1, 1) + for ngram in range(self.ngram) + ] + extended_attention_mask = None + extended_predict_attention_mask = None + else: + ngram_hidden_states = [ + (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed) for ngram in range(self.ngram) + ] + extended_attention_mask = self.prepare_attention_mask(hidden_states, attention_mask) + extended_predict_attention_mask = self.prepare_predict_attention_mask(hidden_states, attention_mask) + + # prepare encoder attention mask + if encoder_attention_mask is not None: + extended_encoder_attention_mask = ( + 1.0 - encoder_attention_mask[:, None, None, :].repeat(1, self.config.num_decoder_attention_heads, 1, 1) + ) * torch.finfo(self.dtype).min + extended_encoder_attention_mask = extended_encoder_attention_mask.to(inputs_embeds.dtype) + else: + extended_encoder_attention_mask = None + + hidden_states = torch.cat([hidden_states] + ngram_hidden_states, 1) + + if self.embeddings_layer_norm: + hidden_states = self.embeddings_layer_norm(hidden_states) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # init attentions, hidden_states and cache with empty tuples + all_main_stream_hidden_states = () if output_hidden_states else None + all_ngram_stream_hidden_states = () if output_hidden_states and self.config.ngram > 0 else None + + all_main_stream_attns = () if output_attentions else None + all_ngram_stream_attns = () if output_attentions else None + all_cross_attns = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + present_key_values = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + assert attn_mask.size()[0] == (len(self.layers)), ( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + # grad cannot be kept because tensor is sliced + all_main_stream_hidden_states += (hidden_states[:, :sequence_length],) + if self.config.ngram > 0: + all_ngram_stream_hidden_states += (hidden_states[:, sequence_length:],) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=extended_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attn_mask=extended_encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None), + extended_predict_attention_mask=extended_predict_attention_mask, + main_relative_position_buckets=main_relative_position_buckets, + predict_relative_position_buckets=predict_relative_position_buckets, + position_ids=position_ids, + past_key_values=past_key_values[idx] if past_key_values is not None else None, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + present_key_values += (layer_outputs[4 if output_attentions else 1],) + + if output_attentions: + all_main_stream_attns += (layer_outputs[1],) + all_ngram_stream_attns += (layer_outputs[2],) + + if self.config.add_cross_attention: + all_cross_attns += (layer_outputs[3],) + + if output_hidden_states: + all_main_stream_hidden_states += (hidden_states[:, :sequence_length],) + if self.config.ngram > 0: + all_ngram_stream_hidden_states += (hidden_states[:, sequence_length:],) + + # split last_hidden_state for return + last_hidden_state = hidden_states[:, :sequence_length] + last_hidden_state_ngram = hidden_states[:, sequence_length:] if self.config.ngram > 0 else None + + if not return_dict: + return tuple( + v + for v in [ + last_hidden_state, + last_hidden_state_ngram, + present_key_values, + all_main_stream_hidden_states, + all_ngram_stream_hidden_states, + all_main_stream_attns, + all_ngram_stream_attns, + all_cross_attns, + ] + if v is not None + ) + return XLMProphetNetDecoderModelOutput( + last_hidden_state=last_hidden_state, + last_hidden_state_ngram=last_hidden_state_ngram, + past_key_values=present_key_values, + hidden_states=all_main_stream_hidden_states, + hidden_states_ngram=all_ngram_stream_hidden_states, + attentions=all_main_stream_attns, + ngram_attentions=all_ngram_stream_attns, + cross_attentions=all_cross_attns, + ) + + def compute_buffered_relative_buckets(self, position_ids): + batch_size, sequence_length = position_ids.shape + + position_ids = torch.arange(1, self.max_target_positions).to(position_ids.device).repeat(1, 1) + main_relative_buckets, predict_relative_buckets = compute_all_stream_relative_buckets( + self.num_buckets, self.relative_max_distance, position_ids + ) + + # buffer relative buckets + main_relative_buckets = main_relative_buckets[:, :sequence_length, :sequence_length].repeat(batch_size, 1, 1) + predict_relative_buckets = torch.cat( + [ + predict_relative_buckets[:, :sequence_length, :sequence_length], + predict_relative_buckets[ + :, :sequence_length, self.max_target_positions : self.max_target_positions + sequence_length + ], + ], + 2, + ).repeat(batch_size, 1, 1) + + return main_relative_buckets, predict_relative_buckets + + def prepare_attention_mask(self, hidden_states, attention_mask): + batch_size, seq_length = hidden_states.shape[:2] + + # get causal mask + causal_mask = torch.full( + (seq_length, seq_length), + torch.finfo(hidden_states.dtype).min, + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + causal_mask = torch.triu(causal_mask, 1) + + extended_causal_mask = causal_mask[:seq_length, :seq_length][None, None, :, :].expand( + (batch_size, self.config.num_decoder_attention_heads) + causal_mask.shape + ) + + # add usual attention mask + if attention_mask is not None: + extended_attention_mask = (1.0 - attention_mask[:, None, None, :]) * torch.finfo(self.dtype).min + extended_attention_mask = extended_causal_mask + extended_attention_mask + else: + extended_attention_mask = extended_causal_mask + return extended_attention_mask.to(hidden_states.dtype) + + def prepare_predict_attention_mask(self, hidden_states, attention_mask): + batch_size, seq_length = hidden_states.shape[:2] + + # get causal mask + predict_causal_mask = ngram_attention_bias( + self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype + ) + predict_causal_mask = torch.cat( + [ + predict_causal_mask[:, :seq_length, :seq_length], + predict_causal_mask[ + :, :seq_length, self.max_target_positions : self.max_target_positions + seq_length + ], + ], + dim=-1, + ) + extended_predict_causal_mask = predict_causal_mask[None, None, :, :, :].expand( + (batch_size, self.config.num_decoder_attention_heads) + predict_causal_mask.shape + ) + + # add usual attention mask + if attention_mask is not None: + extended_attention_mask = (1.0 - attention_mask[:, None, None, None, :]) * torch.finfo(self.dtype).min + extended_attention_mask = extended_attention_mask.expand( + (batch_size, self.config.num_decoder_attention_heads, self.ngram, seq_length, seq_length) + ) + # predicted stream attention_mask should always be 0 + extended_attention_mask = torch.cat( + [extended_attention_mask, torch.zeros_like(extended_attention_mask)], dim=-1 + ) + extended_predict_attention_mask = extended_predict_causal_mask + extended_attention_mask + else: + extended_predict_attention_mask = extended_predict_causal_mask + return extended_predict_attention_mask.to(hidden_states.dtype) + + +@add_start_docstrings( + "The bare XLMProphetNet Model outputting raw hidden-states without any specific head on top.", + XLM_PROPHETNET_START_DOCSTRING, +) +class XLMProphetNetModel(XLMProphetNetPreTrainedModel): + _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"] + + def __init__(self, config: XLMProphetNetConfig): + super().__init__(config) + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + + encoder_config = copy.deepcopy(config) + encoder_config.is_encoder_decoder = False + encoder_config.use_cache = False + self.encoder = XLMProphetNetEncoder(encoder_config, self.word_embeddings) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + self.decoder = XLMProphetNetDecoder(decoder_config, self.word_embeddings) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.word_embeddings + + def set_input_embeddings(self, value): + self.word_embeddings = value + self.encoder.word_embeddings = self.word_embeddings + self.decoder.word_embeddings = self.word_embeddings + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.encoder.word_embeddings, self.word_embeddings) + self._tie_or_clone_weights(self.decoder.word_embeddings, self.word_embeddings) + + def get_encoder(self): + return self.encoder + + @add_start_docstrings_to_model_forward(XLM_PROPHETNET_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=XLMProphetNetSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.Tensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[tuple] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, XLMProphetNetSeq2SeqModelOutput]: + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, XLMProphetNetModel + + >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone") + >>> model = XLMProphetNetModel.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone") + + >>> input_ids = tokenizer( + ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" + ... ).input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + + >>> last_hidden_states = outputs.last_hidden_state # main stream hidden states + >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram # predict hidden states + ```""" + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + use_cache=use_cache, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + return XLMProphetNetSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + last_hidden_state_ngram=decoder_outputs.last_hidden_state_ngram, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_ngram_hidden_states=decoder_outputs.hidden_states_ngram, + decoder_attentions=decoder_outputs.attentions, + decoder_ngram_attentions=decoder_outputs.ngram_attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The XLMProphetNet Model with a language modeling head. Can be used for sequence generation tasks.", + XLM_PROPHETNET_START_DOCSTRING, +) +class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel): + _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"] + + def __init__(self, config: XLMProphetNetConfig): + super().__init__(config) + self.prophetnet = XLMProphetNetModel(config) + self.padding_idx = config.pad_token_id + self.disable_ngram_loss = config.disable_ngram_loss + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.prophetnet.word_embeddings, self.lm_head) + + def get_input_embeddings(self): + return self.prophetnet.word_embeddings + + @add_start_docstrings_to_model_forward(XLM_PROPHETNET_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=XLMProphetNetSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.Tensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, XLMProphetNetSeq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., + config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for + labels in `[0, ..., config.vocab_size]` + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, XLMProphetNetForConditionalGeneration + + >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone") + >>> model = XLMProphetNetForConditionalGeneration.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone") + + >>> input_ids = tokenizer( + ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" + ... ).input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + + >>> logits_next_token = outputs.logits # logits to predict next token as usual + >>> logits_ngram_next_tokens = outputs.logits_ngram # logits to predict 2nd, 3rd, ... next tokens + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + + outputs = self.prophetnet( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + batch_size, sequence_length = ( + decoder_input_ids.shape if decoder_input_ids is not None else decoder_inputs_embeds.shape[:2] + ) + + predicting_streams = outputs[1].view(batch_size, self.config.ngram, sequence_length, -1) + predict_logits = self.lm_head(predicting_streams) + + logits = predict_logits[:, 0] + logits_ngram = predict_logits[:, 1:] if self.config.ngram > 1 else None + + # To use .view in loss computation, make sure that logits is contiguous. + if not logits.is_contiguous(): + logits = logits.contiguous() + + loss = None + if labels is not None: + loss = self._compute_loss(predict_logits, labels) + + if not return_dict: + all_logits = tuple(v for v in [logits, logits_ngram] if v is not None) + return (loss,) + all_logits + outputs[2:] if loss is not None else all_logits + outputs[2:] + else: + return XLMProphetNetSeq2SeqLMOutput( + loss=loss, + logits=logits, + logits_ngram=logits_ngram, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_ngram_hidden_states=outputs.decoder_ngram_hidden_states, + decoder_attentions=outputs.decoder_attentions, + decoder_ngram_attentions=outputs.decoder_ngram_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def _compute_loss(self, logits, labels, ignore_index=-100): + expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index) + + for i in range(self.config.ngram): + if i > 0 and self.disable_ngram_loss: + break + expend_targets[i, :, :] = labels + + logits = logits.transpose(0, 1).contiguous() + lprobs = nn.functional.log_softmax( + logits.view(-1, logits.size(-1)), + dim=-1, + dtype=torch.float32, + ) + + loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean") + + if self.config.eps > 0.0: + smooth_loss = -lprobs.sum(dim=-1, keepdim=True) + non_masked_tokens = expend_targets.ne(ignore_index).view(-1) + smooth_loss = smooth_loss[non_masked_tokens] + smooth_loss = smooth_loss.mean() + + eps_i = self.config.eps / lprobs.size(-1) + loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss + + return loss + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ): + assert encoder_outputs is not None, "`encoder_outputs` have to be passed for generation." + + if past_key_values: + decoder_input_ids = decoder_input_ids[:, -1:] + # first step, decoder_cached_states are empty + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2]) + + layer_past[2:], + ) + return reordered_past + + def get_encoder(self): + return self.prophetnet.encoder + + def get_decoder(self): + return self.prophetnet.decoder + + +@add_start_docstrings( + "The standalone decoder part of the XLMProphetNetModel with a lm head on top. The model can be used for causal" + " language modeling.", + XLM_PROPHETNET_START_DOCSTRING, +) +class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel): + _tied_weights_keys = [ + "prophetnet.word_embeddings.weight", + "prophetnet.decoder.word_embeddings.weight", + "lm_head.weight", + ] + + def __init__(self, config: XLMProphetNetConfig): + # set config for CLM + config = copy.deepcopy(config) + config.is_decoder = True + config.is_encoder_decoder = False + super().__init__(config) + self.prophetnet = XLMProphetNetDecoderWrapper(config) + + self.padding_idx = config.pad_token_id + self.disable_ngram_loss = config.disable_ngram_loss + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.prophetnet.decoder.word_embeddings + + def set_input_embeddings(self, value): + self.prophetnet.decoder.word_embeddings = value + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.prophetnet.decoder.word_embeddings, self.lm_head) + + def set_decoder(self, decoder): + self.prophetnet.decoder = decoder + + def get_decoder(self): + return self.prophetnet.decoder + + @add_start_docstrings_to_model_forward(XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=XLMProphetNetDecoderLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, XLMProphetNetDecoderLMOutput]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, XLMProphetNetForCausalLM + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone") + >>> model = XLMProphetNetForCausalLM.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone") + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + + >>> # Model can also be used with EncoderDecoder framework + >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer + >>> import torch + + >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased") + >>> tokenizer_dec = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone") + >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained( + ... "google-bert/bert-large-uncased", "patrickvonplaten/xprophetnet-large-uncased-standalone" + ... ) + + >>> ARTICLE = ( + ... "the us state department said wednesday it had received no " + ... "formal word from bolivia that it was expelling the us ambassador there " + ... "but said the charges made against him are `` baseless ." + ... ) + >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids + >>> labels = tokenizer_dec( + ... "us rejects charges against its ambassador in bolivia", return_tensors="pt" + ... ).input_ids + >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:]) + + >>> loss = outputs.loss + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + outputs = self.prophetnet.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + batch_size, sequence_length = input_ids.shape if input_ids is not None else inputs_embeds.shape[:2] + + predicting_streams = outputs[1].view(batch_size, self.config.ngram, sequence_length, -1) + predict_logits = self.lm_head(predicting_streams) + + logits = predict_logits[:, 0] + logits_ngram = predict_logits[:, 1:] if self.config.ngram > 1 else None + + loss = None + if labels is not None: + loss = self._compute_loss(predict_logits, labels) + + if not return_dict: + all_logits = tuple(v for v in [logits, logits_ngram] if v is not None) + return (loss,) + all_logits + outputs[2:] if loss is not None else all_logits + outputs[2:] + else: + return XLMProphetNetDecoderLMOutput( + loss=loss, + logits=logits, + logits_ngram=logits_ngram, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + hidden_states_ngram=outputs.hidden_states_ngram, + attentions=outputs.attentions, + ngram_attentions=outputs.ngram_attentions, + cross_attentions=outputs.cross_attentions, + ) + + def _compute_loss(self, logits, labels, ignore_index=-100): + expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index) + + for i in range(self.config.ngram): + if i > 0 and self.disable_ngram_loss: + break + expend_targets[i, :, :] = labels + + logits = logits.transpose(0, 1).contiguous() + lprobs = nn.functional.log_softmax( + logits.view(-1, logits.size(-1)), + dim=-1, + dtype=torch.float32, + ) + + loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean") + + if self.config.eps > 0.0: + smooth_loss = -lprobs.sum(dim=-1, keepdim=True) + non_masked_tokens = expend_targets.ne(ignore_index).view(-1) + smooth_loss = smooth_loss[non_masked_tokens] + smooth_loss = smooth_loss.mean() + + eps_i = self.config.eps / lprobs.size(-1) + loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss + + return loss + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + head_mask=None, + use_cache=None, + **kwargs, + ): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + if past_key_values: + input_ids = input_ids[:, -1:] + # first step, decoder_cached_states are empty + return { + "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed + "attention_mask": attention_mask, + "head_mask": head_mask, + "past_key_values": past_key_values, + "use_cache": use_cache, + } + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +class XLMProphetNetDecoderWrapper(XLMProphetNetPreTrainedModel): + """ + This is a wrapper class, so that [`XLMProphetNetForCausalLM`] can correctly be loaded from pretrained XLMProphetNet + classes. + """ + + def __init__(self, config: XLMProphetNetConfig): + super().__init__(config) + + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.decoder = XLMProphetNetDecoder(config, word_embeddings=self.word_embeddings) + + # Initialize weights and apply final processing + self.post_init() + + def _tie_weights(self): + self._tie_or_clone_weights(self.word_embeddings, self.decoder.get_input_embeddings()) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +__all__ = [ + "XLMProphetNetDecoder", + "XLMProphetNetEncoder", + "XLMProphetNetForCausalLM", + "XLMProphetNetForConditionalGeneration", + "XLMProphetNetModel", + "XLMProphetNetPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py new file mode 100644 index 0000000000000000000000000000000000000000..77431b13c49f47ac9df31b3d4e76ac9e2060895d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py @@ -0,0 +1,322 @@ +# coding=utf-8 +# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os +from shutil import copyfile +from typing import Any, Optional + +from ....tokenization_utils import PreTrainedTokenizer +from ....utils import logging + + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "prophetnet.tokenizer"} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +class XLMProphetNetTokenizer(PreTrainedTokenizer): + """ + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `"[SEP]"`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `"[SEP]"`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + bos_token="[SEP]", + eos_token="[SEP]", + sep_token="[SEP]", + unk_token="[UNK]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + try: + import sentencepiece as spm + except ImportError: + logger.warning( + "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece" + " pip install sentencepiece" + ) + raise + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file + + # Original fairseq vocab and spm vocab must be "aligned": + # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 + # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- + # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' + # spm | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a' + + # put special tokens and [unused] tokens into the vocab + self.fairseq_tokens_to_ids = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[UNK]": 3, "[MASK]": 4} + + for i in range(10): + tok = f"[unused{i}]" + self.fairseq_tokens_to_ids[tok] = 5 + i + + # The first "real" token "," has position 15 in the embedding vocab and position 3 in the spm vocab + self.fairseq_offset = 12 + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + + # TODO ArthurZ fairseq_ids_to_tokens should be removed + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + unk_token=unk_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + try: + import sentencepiece as spm + except ImportError: + logger.warning( + "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece" + " pip install sentencepiece" + ) + raise + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return ([0] * len(token_ids_0)) + [1] + return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLMProphetNet + does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + + """ + + sep = [self.sep_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + sep) * [0] + return len(token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len(self.sp_model) + self.fairseq_offset + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> str: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + # Need to return unknown token if the SP model returned 0 + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A XLMProphetNet sequence has the following format: + + - single sequence: `X [SEP]` + - pair of sequences: `A [SEP] B [SEP]` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return token_ids_0 + [self.sep_token_id] + sep = [self.sep_token_id] + return token_ids_0 + sep + token_ids_1 + sep + + +__all__ = ["XLMProphetNetTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/depth_pro/image_processing_depth_pro.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/depth_pro/image_processing_depth_pro.py new file mode 100644 index 0000000000000000000000000000000000000000..47f224f248bd1c222823c9ba2159999b68c641e1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/depth_pro/image_processing_depth_pro.py @@ -0,0 +1,389 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for DepthPro.""" + +from typing import TYPE_CHECKING, Optional, Union + +import numpy as np + +from ...utils.import_utils import requires + + +if TYPE_CHECKING: + from .modeling_depth_pro import DepthProDepthEstimatorOutput + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + is_torch_available, + make_flat_list_of_images, + to_numpy_array, + valid_images, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_torchvision_available, logging, requires_backends + + +if is_torch_available(): + import torch + +if is_torchvision_available(): + from ...image_utils import pil_torch_interpolation_mapping + + +logger = logging.get_logger(__name__) + + +@requires(backends=("torchvision", "torch")) +class DepthProImageProcessor(BaseImageProcessor): + r""" + Constructs a DepthPro image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + **kwargs, + ): + super().__init__(**kwargs) + size = size if size is not None else {"height": 1536, "width": 1536} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized images. + """ + requires_backends(self, "torch") + + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + + # we use torch interpolation instead of image.resize because DepthProImageProcessor + # rescales, then normalizes, which may cause some values to become negative, before resizing the image. + # image.resize expects all values to be in range [0, 1] or [0, 255] and throws an exception otherwise, + # however pytorch interpolation works with negative values. + # relevant issue here: https://github.com/huggingface/transformers/issues/34920 + # input should be (B, C, H, W) + image_tensor = torch.from_numpy(image).unsqueeze(0) + resized_image = torch.nn.functional.interpolate( + input=image_tensor, + size=output_size, + mode=pil_torch_interpolation_mapping[resample].value, + ) + resized_image = resized_image.squeeze(0).numpy() + return resized_image + + def _validate_input_arguments( + self, + do_resize: bool, + size: dict[str, int], + resample: PILImageResampling, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, list[float]], + image_std: Union[float, list[float]], + data_format: Union[str, ChannelDimension], + ): + if do_resize and None in (size, resample): + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.") + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + self._validate_input_arguments( + do_resize=do_resize, + size=size, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + ) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + all_images = [] + for image in images: + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + # depth-pro rescales and normalizes the image before resizing it + # uses torch interpolation which requires ChannelDimension.FIRST + if do_resize: + image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) + image = self.resize(image=image, size=size, resample=resample) + image = to_channel_dimension_format(image, data_format, input_channel_dim=ChannelDimension.FIRST) + else: + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + + all_images.append(image) + + data = {"pixel_values": all_images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + outputs: "DepthProDepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None, + ) -> list[dict[str, TensorType]]: + """ + Post-processes the raw depth predictions from the model to generate + final depth predictions which is caliberated using the field of view if provided + and resized to specified target sizes if provided. + + Args: + outputs ([`DepthProDepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`Optional[Union[TensorType, list[tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. + + Returns: + `list[dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. + """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + fov = outputs.field_of_view + + batch_size = len(predicted_depth) + + if target_sizes is not None and batch_size != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + results = [] + fov = [None] * batch_size if fov is None else fov + target_sizes = [None] * batch_size if target_sizes is None else target_sizes + for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes): + focal_length = None + if target_size is not None: + # scale image w.r.t fov + if fov_value is not None: + width = target_size[1] + focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * width / focal_length + + # interpolate + depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=depth.unsqueeze(0).unsqueeze(1), + size=target_size, + mode=pil_torch_interpolation_mapping[self.resample].value, + ).squeeze() + + # inverse the depth + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + + results.append( + { + "predicted_depth": depth, + "field_of_view": fov_value, + "focal_length": focal_length, + } + ) + + return results + + +__all__ = ["DepthProImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/depth_pro/modeling_depth_pro.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/depth_pro/modeling_depth_pro.py new file mode 100644 index 0000000000000000000000000000000000000000..7c32703b7c25526bc87beabf1f6cbb835f2b2478 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/depth_pro/modeling_depth_pro.py @@ -0,0 +1,1131 @@ +# coding=utf-8 +# Copyright 2024 The Apple Research Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch DepthPro model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, auto_docstring, logging, torch_int +from ..auto import AutoModel +from .configuration_depth_pro import DepthProConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for DepthPro's outputs. + """ +) +class DepthProOutput(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + features (`Union[torch.FloatTensor, List[torch.FloatTensor]]`, *optional*): + Features from encoders. Can be a single feature or a list of features. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + features: Union[torch.FloatTensor, list[torch.FloatTensor]] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for DepthProForDepthEstimation's output. + """ +) +class DepthProDepthEstimatorOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + field_of_view (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided): + Field of View Scaler. + """ + + loss: Optional[torch.FloatTensor] = None + predicted_depth: Optional[torch.FloatTensor] = None + field_of_view: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +def split_to_patches(pixel_values: torch.Tensor, patch_size: int, overlap_ratio: float) -> torch.Tensor: + """Creates Patches from Batch.""" + batch_size, num_channels, height, width = pixel_values.shape + + if height == width == patch_size: + # create patches only if scaled image is not already equal to patch size + return pixel_values + + stride = torch_int(patch_size * (1 - overlap_ratio)) + + patches = F.unfold(pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride)) + patches = patches.permute(2, 0, 1) + patches = patches.reshape(-1, num_channels, patch_size, patch_size) + + return patches + + +def reshape_features(hidden_states: torch.Tensor) -> torch.Tensor: + """Discard class token and reshape 1D feature map to a 2D grid.""" + n_samples, seq_len, hidden_size = hidden_states.shape + size = torch_int(seq_len**0.5) + + hidden_states = hidden_states[:, -(size**2) :, :] # remove special tokens if there are any + hidden_states = hidden_states.reshape(n_samples, size, size, hidden_size) + hidden_states = hidden_states.permute(0, 3, 1, 2) + + return hidden_states + + +def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch.Tensor: + """Merges smaller patches into image-like feature map.""" + n_patches, hidden_size, out_size, out_size = patches.shape + n_patches_per_batch = n_patches // batch_size + sqrt_n_patches_per_batch = torch_int(n_patches_per_batch**0.5) + new_out_size = sqrt_n_patches_per_batch * out_size + + if n_patches == batch_size: + # merge only if the patches were created from scaled image + # patches are not created when scaled image size is equal to patch size + return patches + + if n_patches_per_batch < 4: + # for each batch, at least 4 small patches are required to + # recreate a large square patch from merging them and later padding is applied + # 3 x (8x8) patches becomes 1 x ( 8x8 ) patch (extra patch ignored, no padding) + # 4 x (8x8) patches becomes 1 x (16x16) patch (padding later) + # 5 x (8x8) patches becomes 1 x (16x16) patch (extra patch ignored, padding later) + # 9 x (8x8) patches becomes 1 x (24x24) patch (padding later) + # thus the following code only rearranges the patches and removes extra ones + padding = 0 + + # make sure padding is not large enough to remove more than half of the patch + padding = min(out_size // 4, padding) + + if padding == 0: + # faster when no padding is required + merged = patches.reshape(n_patches_per_batch, batch_size, hidden_size, out_size, out_size) + merged = merged.permute(1, 2, 0, 3, 4) + merged = merged[:, :, : sqrt_n_patches_per_batch**2, :, :] + merged = merged.reshape( + batch_size, hidden_size, sqrt_n_patches_per_batch, sqrt_n_patches_per_batch, out_size, out_size + ) + merged = merged.permute(0, 1, 2, 4, 3, 5) + merged = merged.reshape(batch_size, hidden_size, new_out_size, new_out_size) + else: + # padding example: + # let out_size = 8, new_out_size = 32, padding = 2 + # each patch is separated by "|" + # and padding is applied to the merging edges of each patch + # 00 01 02 03 04 05 06 07 | 08 09 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 | 24 25 26 27 28 29 30 31 + # 00 01 02 03 04 05 -- -- | -- -- 10 11 12 13 -- -- | -- -- 18 19 20 21 -- -- | -- -- 26 27 28 29 30 31 + i = 0 + boxes = [] + for h in range(sqrt_n_patches_per_batch): + boxes_in_row = [] + for w in range(sqrt_n_patches_per_batch): + box = patches[batch_size * i : batch_size * (i + 1)] + + # collect paddings + paddings = [0, 0, 0, 0] + if h != 0: + # remove pad from height if box is not at top border + paddings[0] = padding + if w != 0: + # remove pad from width if box is not at left border + paddings[2] = padding + if h != sqrt_n_patches_per_batch - 1: + # remove pad from height if box is not at bottom border + paddings[1] = padding + if w != sqrt_n_patches_per_batch - 1: + # remove pad from width if box is not at right border + paddings[3] = padding + + # remove paddings + _, _, box_h, box_w = box.shape + pad_top, pad_bottom, pad_left, pad_right = paddings + box = box[:, :, pad_top : box_h - pad_bottom, pad_left : box_w - pad_right] + + boxes_in_row.append(box) + i += 1 + boxes_in_row = torch.cat(boxes_in_row, dim=-1) + boxes.append(boxes_in_row) + merged = torch.cat(boxes, dim=-2) + + return merged + + +def reconstruct_feature_maps( + hidden_state: torch.Tensor, batch_size: int, padding: int, output_size: tuple[float, float] +) -> torch.Tensor: + """ + Reconstructs feature maps from the hidden state produced by any of the encoder. Converts the hidden state of shape + `(n_patches_per_batch * batch_size, seq_len, hidden_size)` to feature maps of shape + `(batch_size, hidden_size, output_size[0], output_size[1])`. + + Args: + hidden_state (torch.Tensor): Input tensor of shape `(n_patches_per_batch * batch_size, seq_len, hidden_size)` + representing the encoded patches. + batch_size (int): The number of samples in a batch. + padding (int): The amount of padding to be removed when merging patches. + output_size (tuple[float, float]): The desired output size for the feature maps, specified as `(height, width)`. + + Returns: + torch.Tensor: Reconstructed feature maps of shape `(batch_size, hidden_size, output_size[0], output_size[1])`. + """ + # reshape back to image like + features = reshape_features(hidden_state) + + # merge all patches in a batch to create one large patch per batch + features = merge_patches( + features, + batch_size=batch_size, + padding=padding, + ) + + # interpolate patches to base size + features = F.interpolate( + features, + size=output_size, + mode="bilinear", + align_corners=False, + ) + + return features + + +class DepthProPatchEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + self.intermediate_hook_ids = config.intermediate_hook_ids + self.intermediate_feature_dims = config.intermediate_feature_dims + self.scaled_images_ratios = config.scaled_images_ratios + self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios + self.scaled_images_feature_dims = config.scaled_images_feature_dims + self.merge_padding_value = config.merge_padding_value + + self.n_scaled_images = len(config.scaled_images_ratios) + self.n_intermediate_hooks = len(config.intermediate_hook_ids) + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + self.model = AutoModel.from_config(config.patch_model_config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> list[torch.Tensor]: + batch_size, num_channels, height, width = pixel_values.shape + + if min(self.scaled_images_ratios) * min(height, width) < self.config.patch_size: + raise ValueError( + f"Image size {height}x{width} is too small to be scaled " + f"with scaled_images_ratios={self.scaled_images_ratios} " + f"when patch_size={self.config.patch_size}." + ) + + # STEP 1: create 3-level image + + scaled_images = [] + for ratio in self.scaled_images_ratios: + scaled_images.append( + F.interpolate( + pixel_values, + scale_factor=ratio, + mode="bilinear", + align_corners=False, + ) + ) + + # STEP 2: create patches + + for i in range(self.n_scaled_images): + scaled_images[i] = split_to_patches( + scaled_images[i], + patch_size=self.config.patch_size, + overlap_ratio=self.scaled_images_overlap_ratios[i], + ) + n_patches_per_scaled_image = [len(i) for i in scaled_images] + patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first + + # STEP 3: apply patch encoder + + encodings = self.model( + # each patch is processed as a separate batch + patches, + head_mask=head_mask, + # required for intermediate features + output_hidden_states=self.n_intermediate_hooks > 0, + ) + + scaled_images_last_hidden_state = torch.split_with_sizes(encodings[0], n_patches_per_scaled_image[::-1]) + # -1 (reverse list) as patch encoder returns high res patches first, we need low res first + scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram + + scaled_images_features = [] + for i in range(self.n_scaled_images): + hidden_state = scaled_images_last_hidden_state[i] + padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[i])) + output_height = base_height * 2**i + output_width = base_width * 2**i + features = reconstruct_feature_maps( + hidden_state, + batch_size=batch_size, + padding=padding, + output_size=(output_height, output_width), + ) + scaled_images_features.append(features) + + # STEP 5: get intermediate features - (1-2) in diagram + + intermediate_features = [] + for i in range(self.n_intermediate_hooks): + # +1 to correct index position as hidden_states contain embedding output as well + hidden_state = encodings[2][self.intermediate_hook_ids[i] + 1] + padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[-1])) + output_height = base_height * 2 ** (self.n_scaled_images - 1) + output_width = base_width * 2 ** (self.n_scaled_images - 1) + features = reconstruct_feature_maps( + hidden_state, + batch_size=batch_size, + padding=padding, + output_size=(output_height, output_width), + ) + intermediate_features.append(features) + + # STEP 7: combine all features + features = [*scaled_images_features, *intermediate_features] + + return features + + +class DepthProImageEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + self.model = AutoModel.from_config(config.image_model_config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, DepthProOutput]: + batch_size, num_channels, height, width = pixel_values.shape + + # scale the image for image_encoder + size = self.config.image_model_config.image_size + pixel_values = F.interpolate( + pixel_values, + size=(size, size), + mode="bilinear", + align_corners=False, + ) + encodings = self.model( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + features = reconstruct_feature_maps( + encodings[0], + batch_size=batch_size, + padding=0, + output_size=(base_height, base_width), + ) + + if not return_dict: + return (encodings[0], features) + encodings[2:] # ignore last_hidden_state and poooler output + + return DepthProOutput( + last_hidden_state=encodings.last_hidden_state, + features=features, + hidden_states=encodings.hidden_states, + attentions=encodings.attentions, + ) + + +class DepthProEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.intermediate_hook_ids = config.intermediate_hook_ids + self.intermediate_feature_dims = config.intermediate_feature_dims + self.scaled_images_ratios = config.scaled_images_ratios + self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios + self.scaled_images_feature_dims = config.scaled_images_feature_dims + self.merge_padding_value = config.merge_padding_value + + self.n_scaled_images = len(self.scaled_images_ratios) + self.n_intermediate_hooks = len(self.intermediate_hook_ids) + + self.patch_encoder = DepthProPatchEncoder(config) + self.image_encoder = DepthProImageEncoder(config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, DepthProOutput]: + batch_size, num_channels, height, width = pixel_values.shape + + patch_features = self.patch_encoder( + pixel_values, + head_mask=head_mask, + ) + image_encodings = self.image_encoder( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_features = image_encodings[1] # index 1 contains features + + features = [image_features, *patch_features] + + if not return_dict: + return (image_encodings[0], features) + image_encodings[2:] + + return DepthProOutput( + last_hidden_state=image_encodings.last_hidden_state, + features=features, + hidden_states=image_encodings.hidden_states, + attentions=image_encodings.attentions, + ) + + +class DepthProFeatureUpsampleBlock(nn.Module): + def __init__( + self, + config: DepthProConfig, + input_dims: int, + intermediate_dims: int, + output_dims: int, + n_upsample_layers: int, + use_proj: bool = True, + bias: bool = False, + ): + super().__init__() + self.config = config + self.layers = nn.ModuleList() + + # create first projection layer + if use_proj: + proj = nn.Conv2d( + in_channels=input_dims, + out_channels=intermediate_dims, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.layers.append(proj) + + # create following upsample layers + for i in range(n_upsample_layers): + in_channels = intermediate_dims if i == 0 else output_dims + layer = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=bias, + ) + self.layers.append(layer) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + for layer in self.layers: + features = layer(features) + return features + + +class DepthProFeatureUpsample(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.n_scaled_images = len(self.config.scaled_images_ratios) + self.n_intermediate_hooks = len(self.config.intermediate_hook_ids) + + # for image_features + self.image_block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.image_model_config.hidden_size, + intermediate_dims=config.image_model_config.hidden_size, + output_dims=config.scaled_images_feature_dims[0], + n_upsample_layers=1, + use_proj=False, + bias=True, + ) + + # for scaled_images_features + self.scaled_images = nn.ModuleList() + for i, feature_dims in enumerate(config.scaled_images_feature_dims): + block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.patch_model_config.hidden_size, + intermediate_dims=feature_dims, + output_dims=feature_dims, + n_upsample_layers=1, + ) + self.scaled_images.append(block) + + # for intermediate_features + self.intermediate = nn.ModuleList() + for i, feature_dims in enumerate(config.intermediate_feature_dims): + intermediate_dims = config.fusion_hidden_size if i == 0 else feature_dims + block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.patch_model_config.hidden_size, + intermediate_dims=intermediate_dims, + output_dims=feature_dims, + n_upsample_layers=2 + i, + ) + self.intermediate.append(block) + + def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]: + features[0] = self.image_block(features[0]) + + for i in range(self.n_scaled_images): + features[i + 1] = self.scaled_images[i](features[i + 1]) + + for i in range(self.n_intermediate_hooks): + features[self.n_scaled_images + i + 1] = self.intermediate[i](features[self.n_scaled_images + i + 1]) + + return features + + +class DepthProFeatureProjection(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims + self.projections = nn.ModuleList() + for i, in_channels in enumerate(combined_feature_dims): + if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size: + # projection for last layer can be ignored if input and output channels already match + self.projections.append(nn.Identity()) + else: + self.projections.append( + nn.Conv2d( + in_channels=in_channels, + out_channels=config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + ) + + def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]: + projected_features = [] + for i, projection in enumerate(self.projections): + upsampled_feature = projection(features[i]) + projected_features.append(upsampled_feature) + return projected_features + + +class DepthProNeck(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + self.feature_upsample = DepthProFeatureUpsample(config) + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.scaled_images_feature_dims[0] * 2, + out_channels=config.scaled_images_feature_dims[0], + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + self.feature_projection = DepthProFeatureProjection(config) + + def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]: + features = self.feature_upsample(features) + # global features = low res features + image features + global_features = torch.cat((features[1], features[0]), dim=1) + global_features = self.fuse_image_with_low_res(global_features) + features = [global_features, *features[2:]] + features = self.feature_projection(features) + return features + + +# General docstring + + +@auto_docstring +class DepthProPreTrainedModel(PreTrainedModel): + config: DepthProConfig + base_model_prefix = "depth_pro" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _supports_sdpa = True + _no_split_modules = ["DepthProPreActResidualLayer"] + _keys_to_ignore_on_load_unexpected = ["fov_model.*"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)): + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + if module.bias is not None: + module.bias.data.zero_() + + +@auto_docstring +class DepthProModel(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.encoder = DepthProEncoder(config) + self.neck = DepthProNeck(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.encoder.image_encoder.model.get_input_embeddings() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, DepthProOutput]: + r""" + Examples: + + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, DepthProModel + + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> checkpoint = "apple/DepthPro-hf" + >>> processor = AutoProcessor.from_pretrained(checkpoint) + >>> model = DepthProModel.from_pretrained(checkpoint) + + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... output = model(**inputs) + + >>> output.last_hidden_state.shape + torch.Size([1, 35, 577, 1024]) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encodings = self.encoder( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + features = encodings[1] # index 1 contains features + features = self.neck(features) + + if not return_dict: + return (encodings[0], features) + encodings[2:] + + return DepthProOutput( + last_hidden_state=encodings.last_hidden_state, + features=features, + hidden_states=encodings.hidden_states, + attentions=encodings.attentions, + ) + + +# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPT->DepthPro +class DepthProPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. + + Args: + config (`[DepthProConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config: DepthProConfig): + super().__init__() + + self.use_batch_norm = config.use_batch_norm_in_fusion_residual + use_bias_in_fusion_residual = ( + config.use_bias_in_fusion_residual + if config.use_bias_in_fusion_residual is not None + else not self.use_batch_norm + ) + + self.activation1 = nn.ReLU() + self.convolution1 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias_in_fusion_residual, + ) + + self.activation2 = nn.ReLU() + self.convolution2 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias_in_fusion_residual, + ) + + if self.use_batch_norm: + self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size) + self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + + hidden_state = self.convolution1(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm1(hidden_state) + + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm2(hidden_state) + + return hidden_state + residual + + +# Modified from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +# except it uses deconv and skip_add and needs no interpolation +class DepthProFeatureFusionLayer(nn.Module): + def __init__(self, config: DepthProConfig, use_deconv: bool = True): + super().__init__() + self.config = config + self.use_deconv = use_deconv + + self.residual_layer1 = DepthProPreActResidualLayer(config) + self.residual_layer2 = DepthProPreActResidualLayer(config) + + if self.use_deconv: + self.deconv = nn.ConvTranspose2d( + in_channels=config.fusion_hidden_size, + out_channels=config.fusion_hidden_size, + kernel_size=2, + stride=2, + padding=0, + bias=False, + ) + + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) + + def forward(self, hidden_state: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor: + if residual is not None: + residual = self.residual_layer1(residual) + hidden_state = hidden_state + residual + + hidden_state = self.residual_layer2(hidden_state) + if self.use_deconv: + hidden_state = self.deconv(hidden_state) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +# Modified from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro +# with deconv and reversed layers +class DepthProFeatureFusionStage(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + self.num_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + self.intermediate = nn.ModuleList() + for _ in range(self.num_layers - 1): + self.intermediate.append(DepthProFeatureFusionLayer(config)) + + # final layer does not require deconvolution + self.final = DepthProFeatureFusionLayer(config, use_deconv=False) + + def forward(self, hidden_states: list[torch.Tensor]) -> list[torch.Tensor]: + if self.num_layers != len(hidden_states): + raise ValueError( + f"num_layers={self.num_layers} in DepthProFeatureFusionStage" + f"does not match len(hidden_states)={len(hidden_states)}" + ) + + fused_hidden_states = [] + fused_hidden_state = None + for hidden_state, layer in zip(hidden_states[:-1], self.intermediate): + if fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) + + hidden_state = hidden_states[-1] + fused_hidden_state = self.final(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) + + return fused_hidden_states + + +class DepthProFovEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + self.model = AutoModel.from_config(config.fov_model_config) + self.neck = nn.Linear(config.fov_model_config.hidden_size, config.fusion_hidden_size // 2) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + + # scale the image for fov_encoder + size = self.config.fov_model_config.image_size + pixel_values = F.interpolate( + pixel_values, + size=(size, size), + mode="bilinear", + align_corners=False, + ) + encodings = self.model( + pixel_values=pixel_values, + head_mask=head_mask, + ) + hidden_state = encodings[0] + hidden_state = self.neck(hidden_state) + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + features = reconstruct_feature_maps( + hidden_state, + batch_size=batch_size, + padding=0, + output_size=(base_height, base_width), + ) + + return features + + +class DepthProFovHead(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.fusion_hidden_size = config.fusion_hidden_size + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + # create initial head layers + self.layers = nn.ModuleList() + for i in range(config.num_fov_head_layers): + self.layers.append( + nn.Conv2d( + math.ceil(self.fusion_hidden_size / 2 ** (i + 1)), + math.ceil(self.fusion_hidden_size / 2 ** (i + 2)), + kernel_size=3, + stride=2, + padding=1, + ) + ) + self.layers.append(nn.ReLU(True)) + # calculate expected shapes to finally generate a scalar output from final head layer + final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1)) + final_kernel_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) + self.layers.append( + nn.Conv2d( + in_channels=final_in_channels, out_channels=1, kernel_size=final_kernel_size, stride=1, padding=0 + ) + ) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + features = F.interpolate( + features, + size=(self.out_size, self.out_size), + mode="bilinear", + align_corners=False, + ) + for layer in self.layers: + features = layer(features) + return features + + +class DepthProFovModel(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.fusion_hidden_size = config.fusion_hidden_size + + self.fov_encoder = DepthProFovEncoder(config) + self.conv = nn.Conv2d( + self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1 + ) + self.activation = nn.ReLU(inplace=True) + self.head = DepthProFovHead(config) + + def forward( + self, + pixel_values: torch.Tensor, + global_features: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + fov_features = self.fov_encoder(pixel_values, head_mask) + + global_features = self.conv(global_features) + global_features = self.activation(global_features) + + fov_features = fov_features + global_features + fov_output = self.head(fov_features) + fov_output = fov_output.flatten() + + return fov_output + + +class DepthProDepthEstimationHead(nn.Module): + """ + The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks. + This module comprises a sequence of convolutional and transposed convolutional layers + that process the feature map from the fusion to produce a single-channel depth map. + Key operations include dimensionality reduction and upsampling to match the input resolution. + """ + + def __init__(self, config): + super().__init__() + self.config = config + + features = config.fusion_hidden_size + self.layers = nn.ModuleList( + [ + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), + nn.ConvTranspose2d( + in_channels=features // 2, + out_channels=features // 2, + kernel_size=2, + stride=2, + padding=0, + bias=True, + ), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + ] + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for layer in self.layers: + hidden_states = layer(hidden_states) + + predicted_depth = hidden_states.squeeze(dim=1) + return predicted_depth + + +@auto_docstring( + custom_intro=""" + DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). + """ +) +class DepthProForDepthEstimation(DepthProPreTrainedModel): + def __init__(self, config, use_fov_model=None): + r""" + use_fov_model (bool, *optional*): + Whether to use the field of view model. + """ + super().__init__(config) + self.config = config + self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model + + # dinov2 (vit) like encoders + self.depth_pro = DepthProModel(config) + + # dpt (vit) like fusion stage + self.fusion_stage = DepthProFeatureFusionStage(config) + + # depth estimation head + self.head = DepthProDepthEstimationHead(config) + + # dinov2 (vit) like encoder + self.fov_model = DepthProFovModel(config) if self.use_fov_model else None + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], DepthProDepthEstimatorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, DepthProForDepthEstimation + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> checkpoint = "apple/DepthPro-hf" + >>> processor = AutoImageProcessor.from_pretrained(checkpoint) + >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) + + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + >>> model.to(device) + + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt").to(device) + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> # interpolate to original size + >>> post_processed_output = processor.post_process_depth_estimation( + ... outputs, target_sizes=[(image.height, image.width)], + ... ) + + >>> # get the field of view (fov) predictions + >>> field_of_view = post_processed_output[0]["field_of_view"] + >>> focal_length = post_processed_output[0]["focal_length"] + + >>> # visualize the prediction + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) + ```""" + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + depth_pro_outputs = self.depth_pro( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + features = depth_pro_outputs.features + fused_hidden_states = self.fusion_stage(features) + predicted_depth = self.head(fused_hidden_states[-1]) + + if self.use_fov_model: + # frozen features from encoder are used + features_for_fov = features[0].detach() + fov = self.fov_model( + pixel_values=pixel_values, + global_features=features_for_fov, + head_mask=head_mask, + ) + else: + fov = None + + if not return_dict: + outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions] + return tuple(v for v in outputs if v is not None) + + return DepthProDepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + field_of_view=fov, + hidden_states=depth_pro_outputs.hidden_states, + attentions=depth_pro_outputs.attentions, + ) + + +__all__ = ["DepthProPreTrainedModel", "DepthProModel", "DepthProForDepthEstimation"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c1abeb3b4be1f35d8ebc613c9d19ec8e3f75f24 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/configuration_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/configuration_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0c97458ae20ac023eb69977d3e9bdcdc1ffabbe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/configuration_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/feature_extraction_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/feature_extraction_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5066fb0d8d67fe8964d1fb860075e4c62083a605 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/feature_extraction_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/image_processing_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/image_processing_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a017d6b6cf53f71381b44f9dd654b4cbf0432c8a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/image_processing_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/image_processing_detr_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/image_processing_detr_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63825ea821e2eab4aa56993a9ef83ce5d589f7a0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/image_processing_detr_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/modeling_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/modeling_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7bab463964990fd41c09ed5f65311e093a071c19 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/modeling_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d738fbc087888597da19735271366d4e35ab708c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dia import * + from .feature_extraction_dia import * + from .generation_dia import * + from .modeling_dia import * + from .processing_dia import * + from .tokenization_dia import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/configuration_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/configuration_dia.py new file mode 100644 index 0000000000000000000000000000000000000000..d4dec60b3e4853574e4d528e7b641507a8c0b414 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/configuration_dia.py @@ -0,0 +1,376 @@ +# coding=utf-8 +# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Dia model configuration""" + +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DiaEncoderConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DiaEncoder`]. It is used to instantiate a Dia + encoder according to the specified arguments, defining the encoder architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + max_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 16): + Number of key and value heads for each attention layer in the Transformer encoder. + head_dim (`int`, *optional*, defaults to 128): + Dimensionality of the attention head. + intermediate_size (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the normalization layers. + vocab_size (`int`, *optional*, defaults to 256): + Vocabulary size of the Dia model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`DiaModel`]. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"swish"` and `"gelu_new"` are supported. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + """ + + model_type = "dia_encoder" + + def __init__( + self, + max_position_embeddings: int = 1024, + num_hidden_layers: int = 12, + hidden_size: int = 1024, + num_attention_heads: int = 16, + num_key_value_heads: int = 16, + head_dim: int = 128, + intermediate_size: int = 4096, + norm_eps: float = 1e-5, + vocab_size: int = 256, + hidden_act: str = "silu", + rope_theta: float = 10000.0, + rope_scaling: Optional[dict] = None, + initializer_range: float = 0.02, + **kwargs, + ): + self.max_position_embeddings = max_position_embeddings + self.num_hidden_layers = num_hidden_layers + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_attention_heads = num_attention_heads + self.head_dim = head_dim + self.norm_eps = norm_eps + self.vocab_size = vocab_size + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + self.initializer_range = initializer_range + super().__init__(**kwargs) + + +class DiaDecoderConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DiaDecoder`]. It is used to instantiate a Dia + decoder according to the specified arguments, defining the decoder architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + max_position_embeddings (`int`, *optional*, defaults to 3072): + The maximum sequence length that this model might ever be used with. + num_hidden_layers (`int`, *optional*, defaults to 18): + Number of hidden layers in the Transformer decoder. + hidden_size (`int`, *optional*, defaults to 2048): + Dimensionality of the decoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 8192): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 4): + Number of key and value heads for each attention layer in the Transformer decoder. + head_dim (`int`, *optional*, defaults to 128): + Dimensionality of the attention head. + cross_num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each cross-attention layer in the Transformer decoder. + cross_head_dim (`int`, *optional*, defaults to 128): + Dimensionality of the cross-attention head. + cross_num_key_value_heads (`int`, *optional*, defaults to 16): + Number of key and value heads for each cross-attention layer in the Transformer decoder. + cross_hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the cross-attention layers. + norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the normalization layers. + vocab_size (`int`, *optional*, defaults to 1028): + Vocabulary size of the Dia model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`DiaModel`]. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. If string, `"gelu"`, `"relu"`, + `"swish"` and `"gelu_new"` are supported. + num_channels (`int`, *optional*, defaults to 9): + Number of channels for the Dia decoder. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Indicating that this model is part of an encoder-decoder architecture. + """ + + model_type = "dia_decoder" + + def __init__( + self, + max_position_embeddings: int = 3072, + num_hidden_layers: int = 18, + hidden_size: int = 2048, + intermediate_size: int = 8192, + num_attention_heads: int = 16, + num_key_value_heads: int = 4, + head_dim: int = 128, + cross_num_attention_heads: int = 16, + cross_head_dim: int = 128, + cross_num_key_value_heads: int = 16, + cross_hidden_size: int = 1024, + norm_eps: float = 1e-5, + vocab_size: int = 1028, + hidden_act: str = "silu", + num_channels: int = 9, + rope_theta: float = 10000.0, + rope_scaling: Optional[dict] = None, + initializer_range: float = 0.02, + use_cache: bool = True, + is_encoder_decoder: bool = True, + **kwargs, + ): + self.max_position_embeddings = max_position_embeddings + self.num_hidden_layers = num_hidden_layers + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.head_dim = head_dim + self.cross_num_key_value_heads = cross_num_key_value_heads + self.cross_num_attention_heads = cross_num_attention_heads + self.cross_head_dim = cross_head_dim + self.cross_hidden_size = cross_hidden_size + self.norm_eps = norm_eps + self.vocab_size = vocab_size + self.hidden_act = hidden_act + self.num_channels = num_channels + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + self.initializer_range = initializer_range + self.use_cache = use_cache + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + +class DiaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DiaModel`]. It is used to instantiate a + Dia model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the + [nari-labs/Dia-1.6B](https://huggingface.co/nari-labs/Dia-1.6B) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + encoder_config (`DiaEncoderConfig`, *optional*): + Configuration for the encoder part of the model. If not provided, a default `DiaEncoderConfig` will be used. + decoder_config (`DiaDecoderConfig`, *optional*): + Configuration for the decoder part of the model. If not provided, a default `DiaDecoderConfig` will be used. + norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the normalization layers. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Indicating that this model uses an encoder-decoder architecture. + pad_token_id (`int`, *optional*, defaults to 1025): + Padding token id. + eos_token_id (`int`, *optional*, defaults to 1024): + End of stream token id. + bos_token_id (`int`, *optional*, defaults to 1026): + Beginning of stream token id. + delay_pattern (`list[int]`, *optional*, defaults to `[0, 8, 9, 10, 11, 12, 13, 14, 15]`): + The delay pattern for the decoder. The length of this list must match `decoder_config.num_channels`. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + + Example: + + ```python + >>> from transformers import DiaConfig, DiaModel + + >>> # Initializing a DiaConfig with default values + >>> configuration = DiaConfig() + + >>> # Initializing a DiaModel (with random weights) from the configuration + >>> model = DiaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "dia" + keys_to_ignore_at_inference = ["past_key_values"] + sub_configs = {"encoder_config": DiaEncoderConfig, "decoder_config": DiaDecoderConfig} + + def __init__( + self, + encoder_config: Optional[DiaEncoderConfig] = None, + decoder_config: Optional[DiaDecoderConfig] = None, + norm_eps: float = 1e-5, + is_encoder_decoder: bool = True, + pad_token_id: int = 1025, + eos_token_id: int = 1024, + bos_token_id: int = 1026, + delay_pattern: Optional[list[int]] = None, + initializer_range: float = 0.02, + use_cache: bool = True, + **kwargs, + ): + if isinstance(encoder_config, dict): + encoder_config = DiaEncoderConfig(**encoder_config) + if isinstance(decoder_config, dict): + decoder_config = DiaDecoderConfig(**decoder_config) + self.encoder_config = encoder_config if encoder_config is not None else DiaEncoderConfig() + self.decoder_config = decoder_config if decoder_config is not None else DiaDecoderConfig() + self.norm_eps = norm_eps + self.delay_pattern = delay_pattern if delay_pattern is not None else [0, 8, 9, 10, 11, 12, 13, 14, 15] + self.initializer_range = initializer_range + self.use_cache = use_cache + + assert self.decoder_config.num_channels == len(self.delay_pattern), ( + "Number of channels must match delay pattern length." + ) + + super().__init__( + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + bos_token_id=bos_token_id, + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) + + def get_text_config(self, *args, **kwargs): + """Defaulting to audio config as it's the decoder in this case which is usually the text backbone""" + return self.decoder_config + + +__all__ = ["DiaConfig", "DiaEncoderConfig", "DiaDecoderConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/feature_extraction_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/feature_extraction_dia.py new file mode 100644 index 0000000000000000000000000000000000000000..b4376b773b27365932774a35746b2928cf0af707 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/feature_extraction_dia.py @@ -0,0 +1,183 @@ +# coding=utf-8 +# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for Dia""" + +from typing import Optional, Union + +import numpy as np + +from ...feature_extraction_sequence_utils import SequenceFeatureExtractor +from ...feature_extraction_utils import BatchFeature +from ...utils import PaddingStrategy, TensorType, logging + + +logger = logging.get_logger(__name__) + + +class DiaFeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs an Dia feature extractor. + + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. + + Args: + feature_size (`int`, *optional*, defaults to 1): + The feature dimension of the extracted features. Use 1 for mono, 2 for stereo. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz). + padding_value (`float`, *optional*, defaults to 0.0): + The value that is used for padding. + hop_length (`int`, *optional*, defaults to 512): + Overlap length between successive windows. + """ + + model_input_names = ["input_values", "n_quantizers"] + + def __init__( + self, + feature_size: int = 1, + sampling_rate: int = 16000, + padding_value: float = 0.0, + hop_length: int = 512, + **kwargs, + ): + super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) + self.hop_length = hop_length + + def __call__( + self, + raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]], + padding: Optional[Union[bool, str, PaddingStrategy]] = None, + truncation: Optional[bool] = False, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + sampling_rate: Optional[int] = None, + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). + + Args: + raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`): + The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape + `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio + (`feature_size = 2`). + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, *optional*, defaults to `False`): + Activates truncation to cut input sequences longer than `max_length` to `max_length`. + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + return_tensors (`str` or [`~utils.TensorType`], *optional*, default to 'pt'): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + sampling_rate (`int`, *optional*): + The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors. + """ + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of" + f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with" + f" {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. " + "Failing to do so can result in silent errors that might be hard to debug." + ) + + if padding and truncation: + raise ValueError("Both padding and truncation were set. Make sure you only set one.") + elif padding is None: + # by default let's pad the inputs + padding = True + + is_batched = bool( + isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list))) + ) + + if is_batched: + raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio] + elif not is_batched and not isinstance(raw_audio, np.ndarray): + raw_audio = np.asarray(raw_audio, dtype=np.float32) + elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64): + raw_audio = raw_audio.astype(np.float32) + + # always return batch + if not is_batched: + raw_audio = [np.asarray(raw_audio).T] + + # convert stereo to mono if necessary, unique to Dia + for idx, example in enumerate(raw_audio): + if self.feature_size == 2 and example.ndim == 2: + raw_audio[idx] = np.mean(example, -1) + + # verify inputs are valid + for idx, example in enumerate(raw_audio): + if example.ndim > 2: + raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}") + if self.feature_size == 1 and example.ndim != 1: + raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels") + if self.feature_size == 2 and example.ndim != 1: # note the conversion before + raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels") + + input_values = BatchFeature({"input_values": raw_audio}) + + # temporarily treat it as if we were mono as we also convert stereo to mono + original_feature_size = self.feature_size + self.feature_size = 1 + + # normal padding on batch + padded_inputs = self.pad( + input_values, + max_length=max_length, + truncation=truncation, + padding=padding, + return_attention_mask=True, + pad_to_multiple_of=self.hop_length, + ) + padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask") + + input_values = [] + for example in padded_inputs.pop("input_values"): + if self.feature_size == 1: + example = example[..., None] + input_values.append(example.T) + + padded_inputs["input_values"] = input_values + if return_tensors is not None: + padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + + # rewrite back to original feature size + self.feature_size = original_feature_size + + return padded_inputs + + +__all__ = ["DiaFeatureExtractor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/generation_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/generation_dia.py new file mode 100644 index 0000000000000000000000000000000000000000..c297de7203d4e5b30a189047233976d310179907 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/generation_dia.py @@ -0,0 +1,463 @@ +# coding=utf-8 +# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Callable, Optional, Union + +import torch +import torch.distributed as dist + +from ...generation.logits_process import ( + DiaClassifierFreeGuidanceLogitsProcessor, + DiaEOSChannelFilterLogitsProcessor, + DiaEOSDelayPatternLogitsProcessor, + LogitsProcessorList, + TemperatureLogitsWarper, +) +from ...generation.stopping_criteria import StoppingCriteriaList +from ...generation.streamers import BaseStreamer +from ...generation.utils import GenerateOutput, GenerationConfig, GenerationMixin, GenerationMode +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module +from ...modeling_utils import PreTrainedModel +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DiaGenerationMixin(GenerationMixin): + # Indicates CFG which needs preparation to be properly handled by repeats + _uses_cfg = None + + def _get_logits_processor( + self, + generation_config: GenerationConfig, + input_ids_seq_length: Optional[int] = None, + encoder_input_ids: Optional[torch.LongTensor] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None, + logits_processor: Optional[LogitsProcessorList] = None, + device: Optional[str] = None, + model_kwargs: Optional[dict[str, Any]] = None, + negative_prompt_ids: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, + ) -> LogitsProcessorList: + # Need either custom order or custom processor instead + # (Temporarily disabling those for the super function) + original_guidance_scale = generation_config.guidance_scale + original_temperature = generation_config.temperature + generation_config.guidance_scale = None + generation_config.temperature = None + + # Get base processors and those we can integrate easily + custom_processors = LogitsProcessorList() + + if original_temperature is not None and original_temperature != 1.0: + custom_processors.append(TemperatureLogitsWarper(original_temperature)) + + custom_processors.append( + DiaEOSChannelFilterLogitsProcessor( + num_channels=len(self.config.delay_pattern), + eos_token_id=self.config.eos_token_id, + ) + ) + + merged_processors = super()._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_seq_length, + encoder_input_ids=encoder_input_ids, + prefix_allowed_tokens_fn=None, + logits_processor=custom_processors, + device=device, + model_kwargs=model_kwargs, + negative_prompt_ids=negative_prompt_ids, + negative_prompt_attention_mask=negative_prompt_attention_mask, + ) + + # Custom processors we need at specific positions + if original_guidance_scale is not None and original_guidance_scale != 1: + cfg_processor = DiaClassifierFreeGuidanceLogitsProcessor( + guidance_scale=original_guidance_scale, + guidance_top_k=generation_config.top_k, + ) + merged_processors.insert(0, cfg_processor) + + merged_processors.append( + DiaEOSDelayPatternLogitsProcessor( + delay_pattern=self.config.delay_pattern, + eos_token_id=self.config.eos_token_id, + max_generation_len=generation_config.max_length, + device=device, + ) + ) + + # Enable temporarily disabled values back + generation_config.guidance_scale = original_guidance_scale + generation_config.temperature = original_temperature + + return merged_processors + + def _prepare_generation_config( + self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: Any + ) -> tuple[GenerationConfig, dict]: + generation_config, model_kwargs = super()._prepare_generation_config( + generation_config, use_model_defaults, **kwargs + ) + + # We allow generation up to max length + max delay pattern + # (will revert back to max length after generation) + generation_config.max_length += max(self.config.delay_pattern) + + # Internal flag to indicate CFG that needs to prepare unconditioned input + self._uses_cfg = generation_config.guidance_scale is not None and generation_config.guidance_scale != 1 + + return generation_config, model_kwargs + + def _prepare_model_inputs( + self, + inputs: Optional[torch.Tensor] = None, + bos_token_id: Optional[torch.Tensor] = None, + model_kwargs: Optional[dict[str, torch.Tensor]] = None, + ) -> tuple[torch.Tensor, Optional[str], dict[str, torch.Tensor]]: + inputs, input_name, model_kwargs = super()._prepare_model_inputs( + inputs=inputs, + bos_token_id=bos_token_id, + model_kwargs=model_kwargs, + ) + + # If CFG is requested we fill in the unconditioned parts + if self._uses_cfg: + unconditioned_inputs = torch.zeros_like(inputs) + inputs = torch.cat([inputs, unconditioned_inputs], dim=0) + + if model_kwargs.get("attention_mask", None) is not None: + model_kwargs["attention_mask"] = model_kwargs["attention_mask"].repeat(2, 1) + + return inputs, input_name, model_kwargs + + def _prepare_decoder_input_ids_for_generation( + self, + batch_size: int, + model_input_name: str, + model_kwargs: dict[str, torch.Tensor], + decoder_start_token_id: torch.Tensor, + device: Optional[torch.device] = None, + ) -> tuple[torch.LongTensor, dict[str, torch.Tensor]]: + """Prepares `decoder_input_ids` for generation with encoder-decoder models""" + # 1. Check whether the user has defined `decoder_input_ids` and `decoder_attention_mask`; if not error out + decoder_input_ids = decoder_attention_mask = None + if model_kwargs is not None and "decoder_input_ids" in model_kwargs: + decoder_input_ids = model_kwargs.pop("decoder_input_ids") + if model_kwargs is not None and "decoder_attention_mask" in model_kwargs: + decoder_attention_mask = model_kwargs.pop("decoder_attention_mask") + + # We allow generating without preparation (no proper delay) but discourage it + if decoder_input_ids is None or decoder_attention_mask is None: + logger.warning_once( + "In order to generate with Dia, we need the processed audio input: Got `decoder_input_ids`:" + f" {decoder_input_ids is not None} and got `decoder_attention_mask`={decoder_attention_mask is not None}." + f" This can be achieved via the [`DiaProcessor`] but now defaulting to non-delayed generation." + ) + + num_channels = self.config.decoder_config.num_channels + real_batch_size = batch_size // 2 if self._uses_cfg else batch_size + + if decoder_input_ids is None: + decoder_input_ids = torch.full( + (real_batch_size, 1, num_channels), decoder_start_token_id, dtype=torch.long, device=device + ) + + decoder_attention_mask = torch.ones( + size=(real_batch_size, decoder_input_ids.shape[1]), dtype=torch.long, device=device + ) + + # 2. Determine the valid input and what works as mask within the input + delay_mask = decoder_input_ids.long() + valid_input_size = ( + decoder_input_ids.shape[1] - (decoder_input_ids[:, :, 0] == self.config.pad_token_id).sum(dim=-1).max() + ) + decoder_input_ids = delay_mask[:, :valid_input_size].transpose(1, 2).long() + decoder_attention_mask = decoder_attention_mask[:, :valid_input_size].long() + + # 3. Overwrite into model kwargs + model_kwargs["decoder_attention_mask"] = decoder_attention_mask + model_kwargs["decoder_delay_mask"] = delay_mask + + return decoder_input_ids, model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids, + encoder_outputs=None, # Using this to easily get the batch size + decoder_delay_mask=None, + **kwargs, + ): + # Reshape decoder input_ids to 3D to be compile friendly and to fit the expected model input shape + batch_size = encoder_outputs[0].shape[0] // 2 if self._uses_cfg else encoder_outputs[0].shape[0] + input_ids = input_ids.reshape(batch_size, self.config.decoder_config.num_channels, -1).transpose(1, 2) + + # Base method handles most things except CFG and the delay pattern mask + model_inputs = super().prepare_inputs_for_generation(input_ids, encoder_outputs=encoder_outputs, **kwargs) + + # Post processing for CFG and overwriting via delay pattern mask + # 1. Delay pattern mask -- force tokens if not allowed to predict (!= pad_token in mask) + model_inputs["decoder_input_ids"] = self.apply_delay_mask( + input_ids, self.config.pad_token_id, decoder_delay_mask + ) + + # Depending on cache usage we need to pass all or just one + if model_inputs.get("use_cache", False) and model_inputs["cache_position"][0] > 0: + model_inputs["decoder_input_ids"] = model_inputs["decoder_input_ids"][:, -1, :][:, None, :] + + # Be compile friendly + model_inputs["decoder_input_ids"] = model_inputs["decoder_input_ids"].contiguous() + + # 2. Apply CFG duplication if needed + if self._uses_cfg: + for key in ["decoder_input_ids", "decoder_attention_mask", "decoder_position_ids"]: + if model_inputs.get(key, None) is not None: + # double first dimension and keep everything else the same + repeat_pattern = tuple([2] + [1] * (model_inputs[key].ndim - 1)) + model_inputs[key] = model_inputs[key].repeat(*repeat_pattern) + + return model_inputs + + @staticmethod + def apply_delay_mask(input_ids: torch.Tensor, pad_id: int, delay_mask: Optional[torch.Tensor]) -> torch.Tensor: + if delay_mask is None: + return input_ids + + mask_len = min(input_ids.shape[1], delay_mask.shape[1]) + valid_mask = delay_mask[:, :mask_len, :] + valid_input = input_ids[:, :mask_len, :] + + # Overwrite the respective parts of the input + input_ids[:, :mask_len, :] = torch.where(valid_mask == pad_id, valid_input, valid_mask) + + return input_ids + + def _main_generate_loop( + self, + inputs: Optional[torch.Tensor] = None, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None, + synced_gpus: Optional[bool] = None, + assistant_model: Optional["PreTrainedModel"] = None, + streamer: Optional["BaseStreamer"] = None, + negative_prompt_ids: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, + use_model_defaults: Optional[bool] = None, + custom_generate: Optional[str] = None, + **kwargs, + ): + # ********** mostly taken from main generate function up to calling the different methods (see NOTE) ********** + # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call + generation_mode_kwargs = self._extract_generation_mode_kwargs( + custom_generate, + kwargs, + synced_gpus, + assistant_model, + streamer, + ) + generation_config, model_kwargs = self._prepare_generation_config( + generation_config, use_model_defaults, **kwargs + ) + generation_mode = generation_config.get_generation_mode(assistant_model) + + self._validate_model_kwargs(model_kwargs.copy()) + self._validate_generation_mode(generation_mode, generation_config, generation_mode_kwargs) + + # 2. Set generation parameters if not already defined + if synced_gpus is None: + synced_gpus = (is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)) and dist.get_world_size() > 1 + + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + # 3. Define model inputs + kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None + inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs( + inputs, generation_config.bos_token_id, model_kwargs + ) + batch_size = inputs_tensor.shape[0] + + device = inputs_tensor.device + self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=device) + + # 4. Define other model kwargs + if "encoder_outputs" not in model_kwargs: + # if model is encoder decoder encoder_outputs are created and added to `model_kwargs` + model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( + inputs_tensor, model_kwargs, model_input_name, generation_config + ) + + # 5. Prepare `input_ids` which will be used for auto-regressive generation + input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( + batch_size=batch_size, + model_input_name=model_input_name, + model_kwargs=model_kwargs, + decoder_start_token_id=generation_config._decoder_start_token_tensor, + device=inputs_tensor.device, + ) + + if generation_config.token_healing: + input_ids = self.heal_tokens(input_ids, generation_mode_kwargs.get("tokenizer")) + + if streamer is not None: + streamer.put(input_ids.cpu()) + + # 6. Prepare `max_length` depending on other stopping criteria. + # NOTE: incorrect `input_ids.shape[1]` previously + input_ids_length = input_ids.shape[-1] + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None + has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None + generation_config = self._prepare_generated_length( + generation_config=generation_config, + has_default_max_length=has_default_max_length, + has_default_min_length=has_default_min_length, + model_input_name=model_input_name, + inputs_tensor=inputs_tensor, + input_ids_length=input_ids_length, + ) + + # If the model supports `logits_to_keep` in forward(), set it to 1 to avoid computing the whole + # logit matrix. This can save a lot of memory during the first forward pass. Note that assisted decoding + # dynamically overrides this value as it can need more than the last token logits + if self._supports_logits_to_keep() and "logits_to_keep" not in model_kwargs: + model_kwargs["logits_to_keep"] = 1 + + self._validate_generated_length(generation_config, input_ids_length, has_default_max_length) + + # 7. Prepare the cache. + # - `model_kwargs` may be updated in place with a cache as defined by the parameters in `generation_config`. + # - different models have a different cache name expected by the model (default = "past_key_values") + # - `max_length`, prepared above, is used to determine the maximum cache length + max_cache_length = generation_config.max_length - 1 + if ( + inputs_tensor.shape[1] != input_ids_length + and model_input_name == "inputs_embeds" + and not self.config.is_encoder_decoder + ): + max_cache_length += inputs_tensor.shape[1] + self._prepare_cache_for_generation( + generation_config, model_kwargs, generation_mode, batch_size, max_cache_length + ) + + # 8. prepare logits processors and stopping criteria + prepared_logits_processor = self._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_length, + encoder_input_ids=inputs_tensor, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + device=inputs_tensor.device, + model_kwargs=model_kwargs, + negative_prompt_ids=negative_prompt_ids, + negative_prompt_attention_mask=negative_prompt_attention_mask, + ) + prepared_stopping_criteria = self._get_stopping_criteria( + generation_config=generation_config, + stopping_criteria=stopping_criteria, + tokenizer=generation_mode_kwargs.get("tokenizer"), + ) + + # Set model_kwargs `use_cache` so we can use it later in forward runs + model_kwargs["use_cache"] = generation_config.use_cache + # ******************* taken from main generate function up to calling the different methods ******************* + + # Prepare inner 2D logic in generation loop + input_ids = input_ids.reshape(-1, input_ids.shape[-1]) + + # 10. go into different generation modes + if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): + # 11. expand input_ids with `num_return_sequences` additional sequences per batch + if generation_config.num_return_sequences > 1: + raise ValueError("`num_return_sequences>1` is incompatible with Dia.") + + # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`) + return self._sample( + input_ids, + logits_processor=prepared_logits_processor, + stopping_criteria=prepared_stopping_criteria, + generation_config=generation_config, + **generation_mode_kwargs, + **model_kwargs, + ) + else: + raise ValueError( + "Got incompatible mode for generation, should be one of greedy or sampling. " + "Ensure that beam search is de-activated by setting `num_beams=1`." + ) + + @torch.no_grad() + def generate( + self, + inputs: Optional[torch.Tensor] = None, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None, + synced_gpus: Optional[bool] = None, + assistant_model: Optional["PreTrainedModel"] = None, + streamer: Optional["BaseStreamer"] = None, + negative_prompt_ids: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, + use_model_defaults: Optional[bool] = None, + custom_generate: Optional[str] = None, + **kwargs, + ) -> Union[GenerateOutput, torch.LongTensor]: + # We expect the initial input ids to be the complete mask (delayed input) + delay_mask = kwargs.get("decoder_input_ids") + if delay_mask is not None: + delay_mask = delay_mask.clone() + + output = self._main_generate_loop( + inputs=inputs, + generation_config=generation_config, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + synced_gpus=synced_gpus, + assistant_model=assistant_model, + streamer=streamer, + negative_prompt_ids=negative_prompt_ids, + negative_prompt_attention_mask=negative_prompt_attention_mask, + use_model_defaults=use_model_defaults, + custom_generate=custom_generate, + **kwargs, + ) + + return_dict_in_generate = not isinstance(output, torch.Tensor) + + if return_dict_in_generate: + output_sequences = output.sequences + else: + output_sequences = output + + # Reshape from 2D (bsz * channels, seq_len) to 3D (bsz, seq_len, channels) + num_channels = self.config.decoder_config.num_channels + bsz = output_sequences.shape[0] // num_channels + output_sequences = output_sequences.reshape(bsz, num_channels, -1).transpose(1, 2) + + # Apply delay mask + output_sequences = self.apply_delay_mask(output_sequences, self.config.pad_token_id, delay_mask) + + if return_dict_in_generate: + output.sequences = output_sequences + else: + output = output_sequences + + return output diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/modeling_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/modeling_dia.py new file mode 100644 index 0000000000000000000000000000000000000000..cf662b224aabd884521d7e14b8a167886377f4b5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/modeling_dia.py @@ -0,0 +1,958 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/dia/modular_dia.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_dia.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, +) +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + TransformersKwargs, + auto_docstring, + can_return_tuple, + is_torch_flex_attn_available, + is_torchdynamo_compiling, + logging, +) +from ...utils.deprecation import deprecate_kwarg +from .configuration_dia import DiaConfig, DiaDecoderConfig, DiaEncoderConfig +from .generation_dia import DiaGenerationMixin + + +if is_torch_flex_attn_available(): + from ...integrations.flex_attention import make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) + + +@auto_docstring +class DiaPreTrainedModel(PreTrainedModel): + config: DiaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + _can_compile_fullgraph = True + main_input_name = "input_ids" + _no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"] + + +class DiaMultiChannelEmbedding(nn.Module): + """In order to efficiently compute the audio embedding from the 9 different channels, + we vectorize the embedding process by using a single embedding layer and an offset. + Example: + - num_embeds = 4 + - vocab_size = 8 + - num_channels = 3 + We would have offsets = [0, 8, 16] + If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8], + then tokens = audio_codes + offsets + = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24] + This allows us to use a single embedding layer for all channels. + """ + + def __init__(self, config: DiaDecoderConfig): + super().__init__() + self.embed = nn.Embedding(config.vocab_size * config.num_channels, config.hidden_size) + self.hidden_size = config.hidden_size + self.num_channels = config.num_channels + offsets = torch.arange(config.num_channels, dtype=torch.long) * config.vocab_size # (C,) + self.register_buffer("offsets", offsets, persistent=False) + + def forward(self, audio_codes: torch.Tensor) -> torch.Tensor: + tokens = (audio_codes + self.offsets.to(audio_codes.device)).squeeze(1) + embeds = self.embed(tokens).view(tokens.shape[0], audio_codes.shape[1], -1, self.hidden_size) + return embeds.sum(dim=2) + + +class DiaMLP(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + self.activation_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + up_states = self.gate_up_proj(hidden_states) + + gate, up_states = up_states.chunk(2, dim=-1) + up_states = up_states * self.activation_fn(gate) + + return self.down_proj(up_states) + + +@use_kernel_forward_from_hub("RMSNorm") +class DiaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + DiaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class DiaRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: DiaConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class DiaSelfAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Union[DiaEncoderConfig, DiaDecoderConfig], layer_idx: int, is_causal: bool = False): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.num_heads = self.config.num_attention_heads + self.num_key_value_heads = self.config.num_key_value_heads or self.num_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.head_dim = getattr(config, "head_dim", config.hidden_size // self.num_heads) + self.scaling = 1 + self.attention_dropout = 0.0 + self.is_causal = is_causal + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class DiaCrossAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: DiaDecoderConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.cross_hidden_size = config.cross_hidden_size + self.num_heads = self.config.cross_num_attention_heads + self.num_key_value_heads = self.config.cross_num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.head_dim = config.cross_head_dim + self.scaling = 1 + self.attention_dropout = 0.0 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + cross_attention_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[EncoderDecoderCache] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + cross_shape = (*cross_attention_states.shape[:-1], -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + is_updated = past_key_values.is_updated.get(self.layer_idx) if past_key_values is not None else False + if past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_states = past_key_values.cross_attention_cache.layers[self.layer_idx].keys + value_states = past_key_values.cross_attention_cache.layers[self.layer_idx].values + else: + key_states = self.k_proj(cross_attention_states).view(cross_shape).transpose(1, 2) + value_states = self.v_proj(cross_attention_states).view(cross_shape).transpose(1, 2) + + if past_key_values is not None: + # save all states to the cache + key_states, value_states = past_key_values.cross_attention_cache.update( + key_states, + value_states, + self.layer_idx, + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + past_key_values.is_updated[self.layer_idx] = True + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape((*input_shape, -1)).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class DiaEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DiaEncoderConfig, layer_idx: int): + super().__init__() + self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=False) + self.post_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.mlp = DiaMLP(config) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + residual = hidden_states + normed_states = self.pre_sa_norm(hidden_states) + self_attn_output, self_attn_weights = self.self_attention( + normed_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + **kwargs, + ) + hidden_states = residual + self_attn_output + + residual = hidden_states + normed_states = self.post_sa_norm(hidden_states) + mlp_out = self.mlp(normed_states) + hidden_states = residual + mlp_out + + return hidden_states, self_attn_weights + + +class DiaEncoder(DiaPreTrainedModel): + def __init__(self, config: DiaEncoderConfig): + super().__init__(config) + self.config = config + + self.embedding = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList( + [DiaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.rotary_embeddings = DiaRotaryEmbedding(config) + + @auto_docstring + @can_return_tuple + def forward( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[BaseModelOutput, tuple]: + hidden_states = self.embedding(input_ids) + + # RoPE + # Note: We expect right padding and hence always generate + # the position ids on the fly to reduce preparation overhead + position_ids = torch.arange(input_ids.shape[-1], device=input_ids.device)[None, :] + position_embeddings = self.rotary_embeddings(hidden_states, position_ids) + + attention_mask = self._update_full_mask( + attention_mask, + hidden_states, + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + layer_outputs = encoder_layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + **kwargs, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + encoder_states += (hidden_states,) + + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask + def _update_full_mask( + self, + attention_mask: Union[torch.Tensor, None], + inputs_embeds: torch.Tensor, + ): + if attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if 0 in attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + elif self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + return attention_mask + + +class DiaDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DiaDecoderConfig, layer_idx: int): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=True) + self.cross_attention = DiaCrossAttention(config, layer_idx) + self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.pre_ca_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.pre_mlp_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.mlp = DiaMLP(config) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[EncoderDecoderCache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + self_attn_cache = past_key_values + if isinstance(self_attn_cache, EncoderDecoderCache): + self_attn_cache = self_attn_cache.self_attention_cache + + residual = hidden_states + normed_states = self.pre_sa_norm(hidden_states) + self_attn_output, self_attn_weights = self.self_attention( + normed_states, + position_embeddings, + attention_mask, + # Needs to be an arg in order to function properly + # on inplace operations to be carried (e.g. compile) + self_attn_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = residual + self_attn_output + + residual = hidden_states + normed_states = self.pre_ca_norm(hidden_states) + cross_states, cross_attn_weights = self.cross_attention( + normed_states, + encoder_hidden_states, + attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + **kwargs, + ) + hidden_states = residual + cross_states + + residual = hidden_states + normed_states = self.pre_mlp_norm(hidden_states) + mlp_out = self.mlp(normed_states) + hidden_states = residual + mlp_out + + return hidden_states, self_attn_weights, cross_attn_weights + + +class DiaDecoder(DiaPreTrainedModel): + """Transformer Decoder Stack using DenseGeneral.""" + + def __init__(self, config: DiaDecoderConfig): + super().__init__(config) + self.num_channels = config.num_channels + self.vocab_size = config.vocab_size + self.embeddings = DiaMultiChannelEmbedding(config) + self.rotary_embeddings = DiaRotaryEmbedding(config) + self.layers = nn.ModuleList( + [DiaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + + @auto_docstring + @can_return_tuple + def forward( + self, + input_ids: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.LongTensor] = None, + past_key_values: Optional[EncoderDecoderCache] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[BaseModelOutputWithPastAndCrossAttentions, tuple]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`): + The original `decoder_input_ids` in 3D shape to facilitate more efficient computations. + + [What are input IDs?](../glossary#input-ids) + """ + + batch_size, seq_length = input_ids.size()[:-1] + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=input_ids.device + ) + if position_ids is None: + position_ids = cache_position[None, :] + + # RoPE + hidden_states = self.embeddings(input_ids) + position_embeddings = self.rotary_embeddings(hidden_states, position_ids) + + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past cache + mask_seq_length = past_key_values_length + seq_length + attention_mask = torch.ones(batch_size, mask_seq_length, device=input_ids.device) + + attention_mask = create_causal_mask( + config=self.config, + input_embeds=hidden_states, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + encoder_attention_mask = self._update_cross_attn_mask( + encoder_hidden_states, + encoder_attention_mask, + hidden_states.shape[:2], + hidden_states, + ) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer( + hidden_states, + position_embeddings, + attention_mask, + encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns = all_self_attns + (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_cross_attn_mask + def _update_cross_attn_mask( + self, + encoder_hidden_states: Union[torch.Tensor, None], + encoder_attention_mask: Union[torch.Tensor, None], + input_shape: torch.Size, + inputs_embeds: torch.Tensor, + ): + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, + inputs_embeds.dtype, + tgt_len=input_shape[-1], + ) + elif self.config._attn_implementation == "flex_attention": + if isinstance(encoder_attention_mask, torch.Tensor): + encoder_attention_mask = make_flex_block_causal_mask( + encoder_attention_mask, + query_length=input_shape[-1], + is_causal=False, + ) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + return encoder_attention_mask + + +@auto_docstring( + custom_intro=""" + The bare Dia model outputting raw hidden-states without any specific head on top. + """ +) +class DiaModel(DiaPreTrainedModel): + def __init__(self, config: DiaConfig): + super().__init__(config) + self.config = config + self.encoder = DiaEncoder(config.encoder_config) + self.decoder = DiaDecoder(config.decoder_config) + self.post_init() + + def get_encoder(self): + return self.encoder + + @auto_docstring + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_position_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[Union[BaseModelOutput, tuple]] = None, + past_key_values: Optional[EncoderDecoderCache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[tuple, Seq2SeqModelOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) + or (batch_size, target_sequence_length, num_codebooks)`, *optional*): + 1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where + the audio input codebooks are flattened into the batch dimension. This also aligns with the flat- + tened audio logits which are used to calculate the loss. + + 2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of + Dia to calculate embeddings and subsequent steps more efficiently. + + If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape + `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See + [`DiaProcessor.__call__`] for more details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`): + Indices of positions of each input sequence tokens in the position embeddings. + Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`. + + [What are position IDs?](../glossary#position-ids) + """ + + if input_ids is None and encoder_outputs is None: + raise ValueError( + "You should either provide text ids or the cached text encodings. Neither has been found." + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if self.is_gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache and past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + **kwargs, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput + elif not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # On default we initialize the decoder with bos tokens if nothing has been provided + bsz, seq_len, channels = (encoder_outputs[0].shape[0], -1, self.config.decoder_config.num_channels) + if decoder_input_ids is None: + decoder_input_ids = torch.full( + size=(bsz, 1, channels), fill_value=self.config.bos_token_id, device=self.device + ) + # Ensure 3D + if decoder_input_ids.ndim == 2: + decoder_input_ids = decoder_input_ids.reshape(bsz, channels, seq_len).transpose(1, 2) + + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + position_ids=decoder_position_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs[0], + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top. + """ +) +class DiaForConditionalGeneration(DiaPreTrainedModel, DiaGenerationMixin): + base_model_prefix = "model" + + def __init__(self, config: DiaConfig): + super().__init__(config) + self.config = config + self.model = DiaModel(config) + + self.num_channels = config.decoder_config.num_channels + self.vocab_size = config.decoder_config.vocab_size + self.logits_dense = nn.Linear( + config.decoder_config.hidden_size, (self.num_channels * self.vocab_size), bias=False + ) + self.loss_type = "ForMaskedLM" + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + @auto_docstring + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_position_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[Union[BaseModelOutput, tuple]] = None, + past_key_values: Optional[EncoderDecoderCache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[tuple, Seq2SeqLMOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) + or (batch_size, target_sequence_length, num_codebooks)`, *optional*): + 1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where + the audio input codebooks are flattened into the batch dimension. This also aligns with the flat- + tened audio logits which are used to calculate the loss. + + 2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of + Dia to calculate embeddings and subsequent steps more efficiently. + + If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape + `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See + [`DiaProcessor.__call__`] for more details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`): + Indices of positions of each input sequence tokens in the position embeddings. + Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`. + + [What are position IDs?](../glossary#position-ids) + labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in + `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100` + are ignored (masked). + """ + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_position_ids=decoder_position_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + last_hidden_state = outputs[0] + batch_size = last_hidden_state.shape[0] + # 3D <-> 2D makes it necessary to prioritize channel dim + audio_logits = ( + self.logits_dense(last_hidden_state) + .view((batch_size, -1, self.num_channels, self.vocab_size)) + .transpose(1, 2) + .contiguous() + .view(batch_size * self.num_channels, -1, self.vocab_size) + ) + + loss = None + if labels is not None: + loss = self.loss_function(logits=audio_logits, labels=labels, vocab_size=self.vocab_size, **kwargs) + + return Seq2SeqLMOutput( + loss=loss, + logits=audio_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +__all__ = ["DiaModel", "DiaPreTrainedModel", "DiaForConditionalGeneration"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/modular_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/modular_dia.py new file mode 100644 index 0000000000000000000000000000000000000000..f99d32a01d9cffab79bd72852d776447e084681b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/modular_dia.py @@ -0,0 +1,773 @@ +# coding=utf-8 +# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Dia model.""" + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...cache_utils import DynamicCache, EncoderDecoderCache +from ...masking_utils import create_causal_mask +from ...modeling_attn_mask_utils import ( + _prepare_4d_attention_mask, + _prepare_4d_attention_mask_for_sdpa, +) +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, is_torchdynamo_compiling, logging +from ..llama.modeling_llama import ( + LlamaAttention, + LlamaRMSNorm, + LlamaRotaryEmbedding, + eager_attention_forward, +) +from ..phi3.modeling_phi3 import Phi3MLP +from .configuration_dia import DiaConfig, DiaDecoderConfig, DiaEncoderConfig +from .generation_dia import DiaGenerationMixin + + +if is_torch_flex_attn_available(): + from ...integrations.flex_attention import make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) + + +@auto_docstring +class DiaPreTrainedModel(PreTrainedModel): + config: DiaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + _can_compile_fullgraph = True + main_input_name = "input_ids" + _no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"] + + +class DiaMultiChannelEmbedding(nn.Module): + """In order to efficiently compute the audio embedding from the 9 different channels, + we vectorize the embedding process by using a single embedding layer and an offset. + Example: + - num_embeds = 4 + - vocab_size = 8 + - num_channels = 3 + We would have offsets = [0, 8, 16] + If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8], + then tokens = audio_codes + offsets + = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24] + This allows us to use a single embedding layer for all channels. + """ + + def __init__(self, config: DiaDecoderConfig): + super().__init__() + self.embed = nn.Embedding(config.vocab_size * config.num_channels, config.hidden_size) + self.hidden_size = config.hidden_size + self.num_channels = config.num_channels + offsets = torch.arange(config.num_channels, dtype=torch.long) * config.vocab_size # (C,) + self.register_buffer("offsets", offsets, persistent=False) + + def forward(self, audio_codes: torch.Tensor) -> torch.Tensor: + tokens = (audio_codes + self.offsets.to(audio_codes.device)).squeeze(1) + embeds = self.embed(tokens).view(tokens.shape[0], audio_codes.shape[1], -1, self.hidden_size) + return embeds.sum(dim=2) + + +class DiaMLP(Phi3MLP): + pass + + +class DiaRMSNorm(LlamaRMSNorm): + pass + + +class DiaRotaryEmbedding(LlamaRotaryEmbedding): + pass + + +class DiaSelfAttention(LlamaAttention): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Union[DiaEncoderConfig, DiaDecoderConfig], layer_idx: int, is_causal: bool = False): + nn.Module.__init__(self) + self.config = config + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.num_heads = self.config.num_attention_heads + self.num_key_value_heads = self.config.num_key_value_heads or self.num_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.head_dim = getattr(config, "head_dim", config.hidden_size // self.num_heads) + self.scaling = 1 + self.attention_dropout = 0.0 + self.is_causal = is_causal + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + +class DiaCrossAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: DiaDecoderConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.cross_hidden_size = config.cross_hidden_size + self.num_heads = self.config.cross_num_attention_heads + self.num_key_value_heads = self.config.cross_num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.head_dim = config.cross_head_dim + self.scaling = 1 + self.attention_dropout = 0.0 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + cross_attention_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[EncoderDecoderCache] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + cross_shape = (*cross_attention_states.shape[:-1], -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + is_updated = past_key_values.is_updated.get(self.layer_idx) if past_key_values is not None else False + if past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_states = past_key_values.cross_attention_cache.layers[self.layer_idx].keys + value_states = past_key_values.cross_attention_cache.layers[self.layer_idx].values + else: + key_states = self.k_proj(cross_attention_states).view(cross_shape).transpose(1, 2) + value_states = self.v_proj(cross_attention_states).view(cross_shape).transpose(1, 2) + + if past_key_values is not None: + # save all states to the cache + key_states, value_states = past_key_values.cross_attention_cache.update( + key_states, + value_states, + self.layer_idx, + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + past_key_values.is_updated[self.layer_idx] = True + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape((*input_shape, -1)).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class DiaEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DiaEncoderConfig, layer_idx: int): + super().__init__() + self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=False) + self.post_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.mlp = DiaMLP(config) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + residual = hidden_states + normed_states = self.pre_sa_norm(hidden_states) + self_attn_output, self_attn_weights = self.self_attention( + normed_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + **kwargs, + ) + hidden_states = residual + self_attn_output + + residual = hidden_states + normed_states = self.post_sa_norm(hidden_states) + mlp_out = self.mlp(normed_states) + hidden_states = residual + mlp_out + + return hidden_states, self_attn_weights + + +class DiaEncoder(DiaPreTrainedModel): + def __init__(self, config: DiaEncoderConfig): + super().__init__(config) + self.config = config + + self.embedding = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList( + [DiaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.rotary_embeddings = DiaRotaryEmbedding(config) + + @auto_docstring + @can_return_tuple + def forward( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[BaseModelOutput, tuple]: + hidden_states = self.embedding(input_ids) + + # RoPE + # Note: We expect right padding and hence always generate + # the position ids on the fly to reduce preparation overhead + position_ids = torch.arange(input_ids.shape[-1], device=input_ids.device)[None, :] + position_embeddings = self.rotary_embeddings(hidden_states, position_ids) + + attention_mask = self._update_full_mask( + attention_mask, + hidden_states, + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + layer_outputs = encoder_layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + **kwargs, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + encoder_states += (hidden_states,) + + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask + def _update_full_mask( + self, + attention_mask: Union[torch.Tensor, None], + inputs_embeds: torch.Tensor, + ): + if attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if 0 in attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + elif self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + return attention_mask + + +class DiaDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DiaDecoderConfig, layer_idx: int): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=True) + self.cross_attention = DiaCrossAttention(config, layer_idx) + self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.pre_ca_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.pre_mlp_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.mlp = DiaMLP(config) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[EncoderDecoderCache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + self_attn_cache = past_key_values + if isinstance(self_attn_cache, EncoderDecoderCache): + self_attn_cache = self_attn_cache.self_attention_cache + + residual = hidden_states + normed_states = self.pre_sa_norm(hidden_states) + self_attn_output, self_attn_weights = self.self_attention( + normed_states, + position_embeddings, + attention_mask, + # Needs to be an arg in order to function properly + # on inplace operations to be carried (e.g. compile) + self_attn_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = residual + self_attn_output + + residual = hidden_states + normed_states = self.pre_ca_norm(hidden_states) + cross_states, cross_attn_weights = self.cross_attention( + normed_states, + encoder_hidden_states, + attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + **kwargs, + ) + hidden_states = residual + cross_states + + residual = hidden_states + normed_states = self.pre_mlp_norm(hidden_states) + mlp_out = self.mlp(normed_states) + hidden_states = residual + mlp_out + + return hidden_states, self_attn_weights, cross_attn_weights + + +class DiaDecoder(DiaPreTrainedModel): + """Transformer Decoder Stack using DenseGeneral.""" + + def __init__(self, config: DiaDecoderConfig): + super().__init__(config) + self.num_channels = config.num_channels + self.vocab_size = config.vocab_size + self.embeddings = DiaMultiChannelEmbedding(config) + self.rotary_embeddings = DiaRotaryEmbedding(config) + self.layers = nn.ModuleList( + [DiaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + + @auto_docstring + @can_return_tuple + def forward( + self, + input_ids: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.LongTensor] = None, + past_key_values: Optional[EncoderDecoderCache] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[BaseModelOutputWithPastAndCrossAttentions, tuple]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`): + The original `decoder_input_ids` in 3D shape to facilitate more efficient computations. + + [What are input IDs?](../glossary#input-ids) + """ + + batch_size, seq_length = input_ids.size()[:-1] + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=input_ids.device + ) + if position_ids is None: + position_ids = cache_position[None, :] + + # RoPE + hidden_states = self.embeddings(input_ids) + position_embeddings = self.rotary_embeddings(hidden_states, position_ids) + + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past cache + mask_seq_length = past_key_values_length + seq_length + attention_mask = torch.ones(batch_size, mask_seq_length, device=input_ids.device) + + attention_mask = create_causal_mask( + config=self.config, + input_embeds=hidden_states, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + encoder_attention_mask = self._update_cross_attn_mask( + encoder_hidden_states, + encoder_attention_mask, + hidden_states.shape[:2], + hidden_states, + ) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer( + hidden_states, + position_embeddings, + attention_mask, + encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns = all_self_attns + (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_cross_attn_mask + def _update_cross_attn_mask( + self, + encoder_hidden_states: Union[torch.Tensor, None], + encoder_attention_mask: Union[torch.Tensor, None], + input_shape: torch.Size, + inputs_embeds: torch.Tensor, + ): + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, + inputs_embeds.dtype, + tgt_len=input_shape[-1], + ) + elif self.config._attn_implementation == "flex_attention": + if isinstance(encoder_attention_mask, torch.Tensor): + encoder_attention_mask = make_flex_block_causal_mask( + encoder_attention_mask, + query_length=input_shape[-1], + is_causal=False, + ) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + return encoder_attention_mask + + +@auto_docstring( + custom_intro=""" + The bare Dia model outputting raw hidden-states without any specific head on top. + """ +) +class DiaModel(DiaPreTrainedModel): + def __init__(self, config: DiaConfig): + super().__init__(config) + self.config = config + self.encoder = DiaEncoder(config.encoder_config) + self.decoder = DiaDecoder(config.decoder_config) + self.post_init() + + def get_encoder(self): + return self.encoder + + @auto_docstring + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_position_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[Union[BaseModelOutput, tuple]] = None, + past_key_values: Optional[EncoderDecoderCache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[tuple, Seq2SeqModelOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) + or (batch_size, target_sequence_length, num_codebooks)`, *optional*): + 1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where + the audio input codebooks are flattened into the batch dimension. This also aligns with the flat- + tened audio logits which are used to calculate the loss. + + 2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of + Dia to calculate embeddings and subsequent steps more efficiently. + + If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape + `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See + [`DiaProcessor.__call__`] for more details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`): + Indices of positions of each input sequence tokens in the position embeddings. + Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`. + + [What are position IDs?](../glossary#position-ids) + """ + + if input_ids is None and encoder_outputs is None: + raise ValueError( + "You should either provide text ids or the cached text encodings. Neither has been found." + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if self.is_gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache and past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + **kwargs, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput + elif not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # On default we initialize the decoder with bos tokens if nothing has been provided + bsz, seq_len, channels = (encoder_outputs[0].shape[0], -1, self.config.decoder_config.num_channels) + if decoder_input_ids is None: + decoder_input_ids = torch.full( + size=(bsz, 1, channels), fill_value=self.config.bos_token_id, device=self.device + ) + # Ensure 3D + if decoder_input_ids.ndim == 2: + decoder_input_ids = decoder_input_ids.reshape(bsz, channels, seq_len).transpose(1, 2) + + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + position_ids=decoder_position_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs[0], + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top. + """ +) +class DiaForConditionalGeneration(DiaPreTrainedModel, DiaGenerationMixin): + base_model_prefix = "model" + + def __init__(self, config: DiaConfig): + super().__init__(config) + self.config = config + self.model = DiaModel(config) + + self.num_channels = config.decoder_config.num_channels + self.vocab_size = config.decoder_config.vocab_size + self.logits_dense = nn.Linear( + config.decoder_config.hidden_size, (self.num_channels * self.vocab_size), bias=False + ) + self.loss_type = "ForMaskedLM" + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + @auto_docstring + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_position_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[Union[BaseModelOutput, tuple]] = None, + past_key_values: Optional[EncoderDecoderCache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Union[tuple, Seq2SeqLMOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) + or (batch_size, target_sequence_length, num_codebooks)`, *optional*): + 1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where + the audio input codebooks are flattened into the batch dimension. This also aligns with the flat- + tened audio logits which are used to calculate the loss. + + 2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of + Dia to calculate embeddings and subsequent steps more efficiently. + + If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape + `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See + [`DiaProcessor.__call__`] for more details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`): + Indices of positions of each input sequence tokens in the position embeddings. + Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`. + + [What are position IDs?](../glossary#position-ids) + labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in + `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100` + are ignored (masked). + """ + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_position_ids=decoder_position_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + last_hidden_state = outputs[0] + batch_size = last_hidden_state.shape[0] + # 3D <-> 2D makes it necessary to prioritize channel dim + audio_logits = ( + self.logits_dense(last_hidden_state) + .view((batch_size, -1, self.num_channels, self.vocab_size)) + .transpose(1, 2) + .contiguous() + .view(batch_size * self.num_channels, -1, self.vocab_size) + ) + + loss = None + if labels is not None: + loss = self.loss_function(logits=audio_logits, labels=labels, vocab_size=self.vocab_size, **kwargs) + + return Seq2SeqLMOutput( + loss=loss, + logits=audio_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +__all__ = ["DiaModel", "DiaPreTrainedModel", "DiaForConditionalGeneration"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/processing_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/processing_dia.py new file mode 100644 index 0000000000000000000000000000000000000000..402f5152a64bda378ccdf5edd512c86fe643145c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/processing_dia.py @@ -0,0 +1,474 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Processor class for Dia""" + +import math +from pathlib import Path +from typing import Optional, Union + +from ...audio_utils import AudioInput, make_list_of_audio +from ...feature_extraction_utils import BatchFeature +from ...processing_utils import AudioKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...utils import is_soundfile_available, is_torch_available + + +if is_torch_available(): + import torch + +if is_soundfile_available(): + import soundfile as sf + + +class DiaAudioKwargs(AudioKwargs, total=False): + bos_token_id: int + eos_token_id: int + pad_token_id: int + delay_pattern: list[int] + generation: bool + + +class DiaProcessorKwargs(ProcessingKwargs, total=False): + audio_kwargs: DiaAudioKwargs + _defaults = { + "text_kwargs": { + "padding": True, + "padding_side": "right", + "add_special_tokens": False, + }, + "audio_kwargs": { + "eos_token_id": 1024, + "pad_token_id": 1025, + "bos_token_id": 1026, + "delay_pattern": [0, 8, 9, 10, 11, 12, 13, 14, 15], + "generation": True, + "sampling_rate": 44100, + }, + "common_kwargs": {"return_tensors": "pt"}, + } + + +class DiaProcessor(ProcessorMixin): + r""" + Constructs a Dia processor which wraps a [`DiaFeatureExtractor`], [`DiaTokenizer`], and a [`DacModel`] into + a single processor. It inherits, the audio feature extraction, tokenizer, and audio encode/decode functio- + nalities. See [`~DiaProcessor.__call__`], [`~DiaProcessor.encode`], and [`~DiaProcessor.decode`] for more + information. + + Args: + feature_extractor (`DiaFeatureExtractor`): + An instance of [`DiaFeatureExtractor`]. The feature extractor is a required input. + tokenizer (`DiaTokenizer`): + An instance of [`DiaTokenizer`]. The tokenizer is a required input. + audio_tokenizer (`DacModel`): + An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input. + """ + + feature_extractor_class = "DiaFeatureExtractor" + tokenizer_class = "DiaTokenizer" + audio_tokenizer_class = "DacModel" + + def __init__(self, feature_extractor, tokenizer, audio_tokenizer): + super().__init__(feature_extractor, tokenizer, audio_tokenizer=audio_tokenizer) + + def __call__( + self, + text: Union[str, list[str]], + audio: Optional[AudioInput] = None, + output_labels: Optional[bool] = False, + **kwargs: Unpack[DiaProcessorKwargs], + ): + """ + Main method to prepare text(s) and audio to be fed as input to the model. The `audio` argument is + forwarded to the DiaFeatureExtractor's [`~DiaFeatureExtractor.__call__`] and subsequently to the + DacModel's [`~DacModel.encode`]. The `text` argument to [`~DiaTokenizer.__call__`]. Please refer + to the docstring of the above methods for more information. + """ + if not is_torch_available(): + raise ValueError( + "The `DiaProcessor` relies on the `audio_tokenizer` which requires `torch` but we couldn't " + "find it in your environment. You can install torch via `pip install torch`." + ) + + if text is None: + raise ValueError("You need to specify the `text` input to process.") + + output_kwargs = self._merge_kwargs( + DiaProcessorKwargs, + **kwargs, + ) + + text_kwargs = output_kwargs["text_kwargs"] + audio_kwargs = output_kwargs["audio_kwargs"] + common_kwargs = output_kwargs["common_kwargs"] + + return_tensors = common_kwargs.pop("return_tensors", None) + if return_tensors != "pt": + raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") + + data = {} + + # Text + if isinstance(text, str): + text = [text] + elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + encodings = self.tokenizer(text, **text_kwargs) + data.update(encodings) + + # Audio + delay_pattern = audio_kwargs.pop("delay_pattern", None) + audio_bos_token_id = audio_kwargs.pop("bos_token_id", None) + audio_eos_token_id = audio_kwargs.pop("eos_token_id", None) + audio_pad_token_id = audio_kwargs.pop("pad_token_id", None) + generation = audio_kwargs.pop("generation", True) + if ( + audio_bos_token_id is None + or audio_eos_token_id is None + or audio_pad_token_id is None + or delay_pattern is None + ): + raise ValueError( + "To enable processing for Dia, we need the `bos_token_id`, `eos_token_id`, " + "`pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those." + ) + + if generation and output_labels: + raise ValueError( + f"Labels with `generation` is incompatible, got generation={generation}, output_labels={output_labels}." + ) + + batch_size = data["input_ids"].shape[0] + num_channels = len(delay_pattern) + max_delay = max(delay_pattern) + + # Voice cloning generation / general training + if audio is not None: + audio = make_list_of_audio(audio) + input_audios = self.feature_extractor(audio, **audio_kwargs) + + compression_rate = math.prod(self.audio_tokenizer.config.downsampling_ratios) + max_encoded_sequence_len = input_audios["padding_mask"][0].shape[-1] // compression_rate + + decoder_input_ids = [] + decoder_attention_mask = [] + # TODO: dac with batching is currently broken, but non-batch is working + # refer to https://gist.github.com/vasqu/643a45b680cf39fd7467271ee2eb6f80 for a validation script + for padding_mask, audio in zip(input_audios["padding_mask"], input_audios["input_values"]): + # get current length with hop length in mind (as if it were sampled as a single audio) + base_pad_len = self.feature_extractor.hop_length + current_audio_len = math.ceil(padding_mask.sum(dim=-1) / base_pad_len) * base_pad_len + + encoded_sequence_len = current_audio_len // compression_rate + padding_len = max_encoded_sequence_len - encoded_sequence_len + + # compute non-padded forward pass; one extra bos (and eos if training) is added + with torch.no_grad(): + audio = audio[None, ..., :current_audio_len].to(self.audio_tokenizer.device) + input_ids = self.audio_tokenizer.encode(audio).audio_codes.transpose(1, 2) + + if not generation: + input_ids = torch.nn.functional.pad( + input_ids, pad=(0, 0, 0, 1, 0, 0), mode="constant", value=audio_eos_token_id + ) + + # apply padding + # +1 for the bos within the real sequence + input_ids = torch.nn.functional.pad( + input_ids, pad=(0, 0, padding_len + 1, 0, 0, 0), mode="constant", value=audio_bos_token_id + ) + num_valid_inputs = encoded_sequence_len + 1 + max_delay # sequence + bos + delay + num_valid_inputs += 0 if generation else 1 # eos if training + attention_mask = torch.tensor([0] * padding_len + [1] * num_valid_inputs, dtype=torch.long)[None, :] + + decoder_input_ids.append(input_ids) + decoder_attention_mask.append(attention_mask) + + decoder_input_ids = torch.cat(decoder_input_ids, dim=0) + decoder_attention_mask = torch.cat(decoder_attention_mask, dim=0) + # TTS generation + elif generation: + # all bos to start with TTS + decoder_input_ids = torch.full((batch_size, 1, num_channels), audio_bos_token_id, dtype=torch.long) + + # we preemptively add the delay + decoder_attention_mask = torch.ones(size=(batch_size, 1 + max_delay), dtype=torch.long) + else: + raise ValueError("If you try to train, you should provide audio data as well.") + + if batch_size != decoder_input_ids.shape[0]: + raise ValueError( + f"Need the same amount of samples for both text and audio, but got text samples={batch_size} and " + f"audio samples = {decoder_input_ids.shape[0]} instead." + ) + + # prepare shift indices per delay + max_seq_len = decoder_attention_mask.shape[-1] + max_audio_len = max_seq_len - max_delay + precomputed_idx = self.build_indices( + bsz=batch_size, + seq_len=max_seq_len, + num_channels=num_channels, + delay_pattern=delay_pattern, + revert=False, + ) + + # create delay pattern input + # the pad token will be used for masking which input is valid for prediction during generation + prefill = torch.full( + (batch_size, max_seq_len, num_channels), + fill_value=audio_pad_token_id, + dtype=torch.int, + ) + prefill[:, :max_audio_len] = decoder_input_ids + + delayed_decoder_input_ids = self.apply_audio_delay( + audio=prefill, + pad_token_id=audio_pad_token_id, + bos_token_id=audio_bos_token_id, + precomputed_idx=precomputed_idx, + ) + + data.update({"decoder_input_ids": delayed_decoder_input_ids, "decoder_attention_mask": decoder_attention_mask}) + + if output_labels: + # Base idea is to shift on the sequence dim + labels = data["decoder_input_ids"].clone()[:, 1:] + labels[labels == audio_pad_token_id] = -100 + labels[labels == audio_bos_token_id] = -100 + + data["labels"] = labels.transpose(1, 2).reshape(batch_size * num_channels, -1).contiguous().long() + data["decoder_input_ids"] = data["decoder_input_ids"][:, :-1] + data["decoder_attention_mask"] = data["decoder_attention_mask"][:, :-1] + + return BatchFeature(data=data, tensor_type=return_tensors) + + def batch_decode( + self, + decoder_input_ids: "torch.Tensor", + audio_prompt_len: Optional[int] = None, + **kwargs: Unpack[DiaProcessorKwargs], + ) -> list["torch.Tensor"]: + """ + Decodes a batch of audio codebook sequences into their respective audio waveforms via the + `audio_tokenizer`. See [`~DacModel.decode`] for more information. + + Args: + decoder_input_ids (`torch.Tensor`): The complete output sequence of the decoder. + audio_prompt_len (`int`): The audio prefix length (e.g. when using voice cloning). + """ + output_kwargs = self._merge_kwargs( + DiaProcessorKwargs, + **kwargs, + ) + audio_kwargs = output_kwargs["audio_kwargs"] + + delay_pattern = audio_kwargs.pop("delay_pattern", None) + audio_bos_token_id = audio_kwargs.pop("bos_token_id", None) + audio_pad_token_id = audio_kwargs.pop("pad_token_id", None) + if audio_bos_token_id is None or audio_pad_token_id is None or delay_pattern is None: + raise ValueError( + "To enable decoding for Dia, we need the `bos_token_id`, `pad_token_id`, " + "and `delay_pattern`. You may have accidentally overwritten one of those." + ) + + # either decode the whole audio sequence or only the generated parts + if audio_prompt_len is not None: + audio_prompt_len = torch.tensor(audio_prompt_len, device=decoder_input_ids.device, dtype=torch.long) + start_of_generation_idx = audio_prompt_len[None].expand(decoder_input_ids.shape[0]) + else: + start_of_generation_idx = (decoder_input_ids[:, :, 0] == audio_bos_token_id).sum(dim=-1) + # -1 for the eos token + end_of_generation_idx = ( + decoder_input_ids.shape[1] - (decoder_input_ids[:, :, 0] == audio_pad_token_id).sum(dim=-1) - 1 + ) + + # revert delay + bsz, seq_len, num_channels = decoder_input_ids.shape + precomputed_idx = self.build_indices( + bsz=bsz, + seq_len=seq_len, + num_channels=num_channels, + delay_pattern=delay_pattern, + revert=True, + ) + + output_sequences = self.apply_audio_delay( + audio=decoder_input_ids, + # We do not care about these values as we cut them out + # with `start_of_generation_idx` and `end_of_generation_idx` + pad_token_id=-1, + bos_token_id=-1, + precomputed_idx=precomputed_idx, + ).transpose(1, 2) + + # retrieve the correct sequences each + audios = [] + # TODO: see above, dac doesn't work in batches yet + with torch.no_grad(): + for i in range(start_of_generation_idx.shape[0]): + output_i = output_sequences[i, :, start_of_generation_idx[i] : end_of_generation_idx[i]][None, ...] + output_i = output_i.to(self.audio_tokenizer.device) + audio_i = self.audio_tokenizer.decode(audio_codes=output_i).audio_values.cpu().squeeze() + audios.append(audio_i) + + return audios + + def decode( + self, + decoder_input_ids: "torch.Tensor", + audio_prompt_len: Optional[int] = None, + **kwargs: Unpack[DiaProcessorKwargs], + ) -> "torch.Tensor": + """ + Decodes a single sequence of audio codebooks into the respective audio waveform via the + `audio_tokenizer`. See [`~DacModel.decode`] and [`~DiaProcessor.batch_decode`] for more information. + """ + if decoder_input_ids.shape[0] != 1: + raise ValueError( + f"Expecting a single output to be decoded but received {decoder_input_ids.shape[0]} samples instead." + ) + + return self.batch_decode(decoder_input_ids, audio_prompt_len, **kwargs)[0] + + def get_audio_prompt_len( + self, + decoder_attention_mask: "torch.Tensor", + **kwargs: Unpack[DiaProcessorKwargs], + ) -> int: + """Utility function to get the audio prompt length.""" + output_kwargs = self._merge_kwargs( + DiaProcessorKwargs, + **kwargs, + ) + audio_kwargs = output_kwargs["audio_kwargs"] + + delay_pattern = audio_kwargs.pop("delay_pattern", None) + if delay_pattern is None: + raise ValueError( + "To enable the utility of retrieving the prompt length for Dia, we need the " + "`delay_pattern`. You may have accidentally overwritten this." + ) + return decoder_attention_mask.shape[1] - max(delay_pattern) + + # Copied from transformers.models.csm.processing_csm.CsmProcessor.save_audio with Csm->Dia + def save_audio( + self, + audio: AudioInput, + saving_path: Union[str, Path, list[Union[str, Path]]], + **kwargs: Unpack[DiaProcessorKwargs], + ): + # TODO: @eustlb, this should be in AudioProcessor + if not is_soundfile_available(): + raise ImportError("Please install `soundfile` to save audio files.") + + # ensure correct audio input + audio = make_list_of_audio(audio) + + # ensure correct saving path + if isinstance(saving_path, (str, Path)): + saving_path = [saving_path] + elif not (isinstance(saving_path, (list, tuple)) and all(isinstance(p, (str, Path)) for p in saving_path)): + raise ValueError("Invalid input path. Please provide a string, or a list of strings") + + if len(audio) != len(saving_path): + raise ValueError("The number of audio and saving paths must be the same") + + output_kwargs = self._merge_kwargs( + DiaProcessorKwargs, + **kwargs, + ) + audio_kwargs = output_kwargs["audio_kwargs"] + sampling_rate = audio_kwargs["sampling_rate"] + + for audio_value, p in zip(audio, saving_path): + if isinstance(audio_value, torch.Tensor): + audio_value = audio_value.cpu().float().numpy() + sf.write(p, audio_value, sampling_rate) + + @staticmethod + def build_indices( + bsz: int, + seq_len: int, + num_channels: int, + delay_pattern: list[int], + revert: bool = False, + ) -> tuple["torch.Tensor", "torch.Tensor"]: + """ + Precompute (sequence_idx, all_idx) so that out[seq, channel] = in[seq - delay[channel], channel] + or in[seq, channel] = out[seq + delay[channel], channel] if `revert`. + Negative sequence_idx => BOS; sequence_idx >= seq_len => PAD. + """ + delay_array = torch.tensor(delay_pattern, dtype=torch.int32) + + # (0..seq_len-1) + sequence_idx = torch.arange(seq_len, dtype=torch.int32)[None, :].expand(bsz, seq_len)[..., None] + # + or - delay depending if we delay or revert the delay + if not revert: + sequence_idx = sequence_idx - delay_array[None, None, :] + else: + sequence_idx = sequence_idx + delay_array[None, None, :] + # if delay goes over the range we clamp back to valid values + valid_sequence_idx = torch.clamp(sequence_idx, 0, seq_len - 1) + + batch_idx = torch.arange(bsz, dtype=torch.int32)[:, None, None].expand(bsz, seq_len, num_channels) + channel_idx = torch.arange(num_channels, dtype=torch.int32)[None, None, :].expand(bsz, seq_len, num_channels) + + all_idx = torch.stack( + [batch_idx.reshape(-1), valid_sequence_idx.reshape(-1), channel_idx.reshape(-1)], + dim=1, + ).long() + + return sequence_idx, all_idx + + @staticmethod + def apply_audio_delay( + audio: "torch.Tensor", + pad_token_id: int, + bos_token_id: int, + precomputed_idx: tuple["torch.Tensor", "torch.Tensor"], + ) -> "torch.Tensor": + """ + Applies or reverts the delay pattern to batched audio tokens using precomputed indices, + inserting BOS where sequence_idx < 0 and PAD where sequence_idx >= seq_len. + + Args: + audio: audio tokens of shape [bsz, seq_len, num_channels] + pad_token_id: the PAD token + bos_token_id: the BOS token + precomputed_idx: from `build_indices` + + Returns: + final_audio: delayed or reverted audio tokens of shape [bsz, seq_len, num_channels] + """ + # Move everything to the same device + device = audio.device + sequence_idx, all_idx = precomputed_idx + sequence_idx = sequence_idx.to(device) + all_idx = all_idx.to(device) + + # Gather per precomputed indices + batch_idx, valid_sequence_idx, channel_idx = torch.unbind(all_idx, dim=-1) + gathered_audio = audio[batch_idx, valid_sequence_idx, channel_idx].view(audio.size()) + + # Mask according to negative sequence_idx => BOS; sequence_idx >= seq_len => PAD + mask_bos = sequence_idx < 0 + mask_pad = sequence_idx >= audio.shape[1] + final_audio = torch.where(mask_bos, bos_token_id, torch.where(mask_pad, pad_token_id, gathered_audio)) + + return final_audio + + +__all__ = ["DiaProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/tokenization_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/tokenization_dia.py new file mode 100644 index 0000000000000000000000000000000000000000..4e205906ea709ee2c20f25b0bf6f4fa66ab1f4a4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/tokenization_dia.py @@ -0,0 +1,118 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for Dia.""" + +from typing import Optional + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DiaTokenizer(PreTrainedTokenizer): + """ + Construct a Dia tokenizer. Dia simply uses raw bytes utf-8 encoding except for special tokens `[S1]` and `[S2]`. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + max_length (`int`, *optional*, defaults to 1024): + The maximum length of the sequences when encoding. Sequences longer than this will be truncated. + offset (`int`, *optional*, defaults to 0): + The offset of the tokenizer. + """ + + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + pad_token: Optional[str] = "", + unk_token: Optional[str] = "", + max_length: Optional[int] = 1024, + offset: int = 0, + **kwargs, + ): + # We have no eos/bos tokens but allow padding -- no l/r strip as we treat them as tokens as well + pad_token = AddedToken(pad_token) if isinstance(pad_token, str) else pad_token + unk_token = AddedToken(unk_token) if isinstance(unk_token, str) else unk_token + + self._utf_vocab_size = 2**8 # utf is 8 bits + self._added_tokens_decoder = {0: pad_token, 1: AddedToken("[S1]"), 2: AddedToken("[S2]")} + self.offset = offset + super().__init__( + unk_token=unk_token, + pad_token=pad_token, + max_length=max_length, + **kwargs, + ) + + @property + def vocab_size(self): + return self._utf_vocab_size + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> list[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + tokens = [chr(i) for i in text.encode("utf-8")] + return tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + + if len(token) != 1: + token_id = None + else: + token_id = ord(token) + self.offset + + return token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = chr(index - self.offset) + return token + + def convert_tokens_to_string(self, tokens: list[str]) -> str: + """Converts a sequence of tokens (string) in a single string.""" + bstring = b"" + for token in tokens: + if token in self.added_tokens_decoder: + added_token_obj = self.added_tokens_decoder[token] + tok_string = str(added_token_obj).encode("utf-8") + elif token in self.added_tokens_encoder: + tok_string = token.encode("utf-8") + else: + tok_string = token.encode("utf-8") # Assume general string token + bstring += tok_string + string = bstring.decode("utf-8", errors="ignore") + return string + + # No vocab file + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + return () + + +__all__ = ["DiaTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c162fce0a48bd164bd0e0a615b942ee4805a12aa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_diffllama import * + from .modeling_diffllama import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/configuration_diffllama.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/configuration_diffllama.py new file mode 100644 index 0000000000000000000000000000000000000000..210607271927ab2f3a7aa1ec1e874fb296c32a73 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/configuration_diffllama.py @@ -0,0 +1,199 @@ +# coding=utf-8 +# Copyright 2024 weak-kajuma and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on Llama implementations in this library and Microsoft's +# Differential Transformer implementations. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DiffLlama model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation + + +class DiffLlamaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DiffLlamaModel`]. It is used to instantiate an DiffLlama + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults + will yield a similar configuration to that of the [kajuma/DiffLlama-0.3B-handcut](https://huggingface.co/kajuma/DiffLlama-0.3B-handcut). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the DiffLlama model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`DiffLlamaModel`] + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 8192): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 16): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'diffllama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'diffllama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'diffllama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'diffllama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + lambda_std_dev (`float`, *optional*, defaults to 0.1): + The standard deviation for initialization of parameter lambda in attention layer. + head_dim (`int`, *optional*): + The attention head dimension. If None, it will default to hidden_size // num_heads + + ```python + >>> from transformers import DiffLlamaModel, DiffLlamaConfig + + >>> # Initializing a DiffLlama diffllama-7b style configuration + >>> configuration = DiffLlamaConfig() + + >>> # Initializing a model from the diffllama-7b style configuration + >>> model = DiffLlamaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "diffllama" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=16, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + lambda_std_dev=0.1, + head_dim=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.lambda_std_dev = lambda_std_dev + self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["DiffLlamaConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/modeling_diffllama.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/modeling_diffllama.py new file mode 100644 index 0000000000000000000000000000000000000000..094cc375057f71eb51644bf2b49c524613ed22e1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/modeling_diffllama.py @@ -0,0 +1,767 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/diffllama/modular_diffllama.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_diffllama.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 weak-kajuma and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on Llama implementations in this library and Microsoft's +# Differential Transformer implementations. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, StaticCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask +from ...modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask +from ...modeling_layers import ( + GenericForQuestionAnswering, + GenericForSequenceClassification, + GenericForTokenClassification, + GradientCheckpointingLayer, +) +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import check_model_inputs +from .configuration_diffllama import DiffLlamaConfig + + +logger = logging.get_logger(__name__) + + +class DiffLlamaMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def lambda_init_fn(layer_idx): + return 0.8 - 0.6 * math.exp(-0.3 * layer_idx) + + +class DiffLlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: DiffLlamaConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads) + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + # under this are not used + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) + + self.lambda_init = lambda_init_fn(layer_idx) + self.lambda_q1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,))) + self.lambda_k1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,))) + self.lambda_q2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,))) + self.lambda_k2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,))) + self.groupnorm = nn.RMSNorm(2 * self.head_dim, eps=config.rms_norm_eps, elementwise_affine=False) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + bsz, target_len, _ = hidden_states.size() + q_len = target_len + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1) + value_states = value_states.repeat(1, 2, 1, 1) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to( + query_states.dtype + ) + lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to( + query_states.dtype + ) + lambda_full = lambda_1 - lambda_2 + self.lambda_init + + attn_output = torch.matmul(attn_weights, value_states) + attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1) + + attn_output = attn_output1 - lambda_full * attn_output2 + attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output) + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class DiffLlamaFlashAttention2(DiffLlamaAttention): + """ + DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask() + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> tuple[torch.Tensor, None]: + if isinstance(past_key_values, StaticCache): + raise ValueError( + "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` " + "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers" + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (DiffLlamaRMSNorm handles it correctly) + + input_dtype = query_states.dtype + device_type = query_states.device.type if query_states.device.type != "mps" else "cpu" + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = ( + torch.get_autocast_dtype(device_type) + if hasattr(torch, "get_autocast_dtype") + else torch.get_autocast_gpu_dtype() + ) + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + value_states1, value_states2 = torch.chunk(value_states, 2, dim=2) + value_states1 = value_states1.repeat(1, 1, 2, 1) + value_states2 = value_states2.repeat(1, 1, 2, 1) + + attn_output1 = _flash_attention_forward( + query_states, + key_states, + value_states1, + attention_mask, + q_len, + position_ids=position_ids, + dropout=dropout_rate, + sliding_window=getattr(self, "sliding_window", None), + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + ) + + attn_output2 = _flash_attention_forward( + query_states, + key_states, + value_states2, + attention_mask, + q_len, + position_ids=position_ids, + dropout=dropout_rate, + sliding_window=getattr(self, "sliding_window", None), + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + ) + + attn_output = torch.cat([attn_output1, attn_output2], dim=-1) + attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=2) + + lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to( + query_states.dtype + ) + lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to( + query_states.dtype + ) + lambda_full = lambda_1 - lambda_2 + self.lambda_init + + attn_output = attn_output1 - lambda_full * attn_output2 + attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output) + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, None + + +class DiffLlamaSdpaAttention(DiffLlamaAttention): + """ + DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from DiffLlamaAttention.forward + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1) + value_states = value_states.repeat(1, 2, 1, 1) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + is_causal = causal_mask is None and q_len > 1 + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1) + + lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to( + query_states.dtype + ) + lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to( + query_states.dtype + ) + lambda_full = lambda_1 - lambda_2 + self.lambda_init + + attn_output = attn_output1 - lambda_full * attn_output2 + attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output) + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + attn_output = self.o_proj(attn_output) + return attn_output, None + + +@use_kernel_forward_from_hub("RMSNorm") +class DiffLlamaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + DiffLlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +DIFFLLAMA_ATTENTION_CLASSES = { + "eager": DiffLlamaAttention, + "flash_attention_2": DiffLlamaFlashAttention2, + "sdpa": DiffLlamaSdpaAttention, +} + + +class DiffLlamaDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DiffLlamaConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = DIFFLLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + + self.mlp = DiffLlamaMLP(config) + self.input_layernorm = DiffLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = DiffLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +@auto_docstring +class DiffLlamaPreTrainedModel(PreTrainedModel): + config: DiffLlamaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["DiffLlamaDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = False + + _can_compile_fullgraph = True + _supports_attention_backend = False + _can_record_outputs = { + "hidden_states": DiffLlamaDecoderLayer, + "attentions": DiffLlamaAttention, + } + + def _init_weights(self, module): + super()._init_weights(module) + if isinstance(module, DiffLlamaAttention): + module.lambda_q1.data.normal_(0, self.config.lambda_std_dev) + module.lambda_k1.data.normal_(0, self.config.lambda_std_dev) + module.lambda_q2.data.normal_(0, self.config.lambda_std_dev) + module.lambda_k2.data.normal_(0, self.config.lambda_std_dev) + + +class DiffLlamaRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: DiffLlamaConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +@auto_docstring +class DiffLlamaModel(DiffLlamaPreTrainedModel): + def __init__(self, config: DiffLlamaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [DiffLlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = DiffLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = DiffLlamaRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +@auto_docstring +class DiffLlamaForCausalLM(DiffLlamaPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = DiffLlamaModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, DiffLlamaForCausalLM + + >>> model = DiffLlamaForCausalLM.from_pretrained("google/diffllama-7b") + >>> tokenizer = AutoTokenizer.from_pretrained("google/diffllama-7b") + + >>> prompt = "What is your favorite condiment?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "What is your favorite condiment?" + ```""" + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class DiffLlamaForSequenceClassification(GenericForSequenceClassification, DiffLlamaPreTrainedModel): + pass + + +class DiffLlamaForQuestionAnswering(GenericForQuestionAnswering, DiffLlamaPreTrainedModel): + base_model_prefix = "transformer" # For BC, where `transformer` was used instead of `model` + + +class DiffLlamaForTokenClassification(GenericForTokenClassification, DiffLlamaPreTrainedModel): + pass + + +__all__ = [ + "DiffLlamaPreTrainedModel", + "DiffLlamaModel", + "DiffLlamaForCausalLM", + "DiffLlamaForSequenceClassification", + "DiffLlamaForQuestionAnswering", + "DiffLlamaForTokenClassification", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/modular_diffllama.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/modular_diffllama.py new file mode 100644 index 0000000000000000000000000000000000000000..253b99edff0d7a557da404fa680ce8403e22ccf1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/modular_diffllama.py @@ -0,0 +1,447 @@ +# coding=utf-8 +# Copyright 2024 weak-kajuma and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on Llama implementations in this library and Microsoft's +# Differential Transformer implementations. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Optional + +import torch +from torch import nn + +from ...cache_utils import Cache, StaticCache +from ...modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask +from ...modeling_utils import PreTrainedModel +from ...utils import logging +from ...utils.deprecation import deprecate_kwarg +from ..gemma.modeling_gemma import GemmaForCausalLM +from ..llama.modeling_llama import ( + LlamaDecoderLayer, + LlamaForQuestionAnswering, + LlamaForSequenceClassification, + LlamaForTokenClassification, + LlamaModel, + LlamaPreTrainedModel, + apply_rotary_pos_emb, + repeat_kv, +) +from ..mistral.modeling_mistral import MistralMLP +from .configuration_diffllama import DiffLlamaConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "kajuma/DiffLlama-0.3B-handcut" +_CONFIG_FOR_DOC = "DiffLlamaConfig" + + +class DiffLlamaMLP(MistralMLP): + pass + + +def lambda_init_fn(layer_idx): + return 0.8 - 0.6 * math.exp(-0.3 * layer_idx) + + +class DiffLlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: DiffLlamaConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads) + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + # under this are not used + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) + + self.lambda_init = lambda_init_fn(layer_idx) + self.lambda_q1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,))) + self.lambda_k1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,))) + self.lambda_q2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,))) + self.lambda_k2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,))) + self.groupnorm = nn.RMSNorm(2 * self.head_dim, eps=config.rms_norm_eps, elementwise_affine=False) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + bsz, target_len, _ = hidden_states.size() + q_len = target_len + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1) + value_states = value_states.repeat(1, 2, 1, 1) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to( + query_states.dtype + ) + lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to( + query_states.dtype + ) + lambda_full = lambda_1 - lambda_2 + self.lambda_init + + attn_output = torch.matmul(attn_weights, value_states) + attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1) + + attn_output = attn_output1 - lambda_full * attn_output2 + attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output) + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class DiffLlamaFlashAttention2(DiffLlamaAttention): + """ + DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask() + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> tuple[torch.Tensor, None]: + if isinstance(past_key_values, StaticCache): + raise ValueError( + "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` " + "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers" + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (DiffLlamaRMSNorm handles it correctly) + + input_dtype = query_states.dtype + device_type = query_states.device.type if query_states.device.type != "mps" else "cpu" + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = ( + torch.get_autocast_dtype(device_type) + if hasattr(torch, "get_autocast_dtype") + else torch.get_autocast_gpu_dtype() + ) + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + value_states1, value_states2 = torch.chunk(value_states, 2, dim=2) + value_states1 = value_states1.repeat(1, 1, 2, 1) + value_states2 = value_states2.repeat(1, 1, 2, 1) + + attn_output1 = _flash_attention_forward( + query_states, + key_states, + value_states1, + attention_mask, + q_len, + position_ids=position_ids, + dropout=dropout_rate, + sliding_window=getattr(self, "sliding_window", None), + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + ) + + attn_output2 = _flash_attention_forward( + query_states, + key_states, + value_states2, + attention_mask, + q_len, + position_ids=position_ids, + dropout=dropout_rate, + sliding_window=getattr(self, "sliding_window", None), + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + ) + + attn_output = torch.cat([attn_output1, attn_output2], dim=-1) + attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=2) + + lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to( + query_states.dtype + ) + lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to( + query_states.dtype + ) + lambda_full = lambda_1 - lambda_2 + self.lambda_init + + attn_output = attn_output1 - lambda_full * attn_output2 + attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output) + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, None + + +class DiffLlamaSdpaAttention(DiffLlamaAttention): + """ + DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from DiffLlamaAttention.forward + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1) + value_states = value_states.repeat(1, 2, 1, 1) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + is_causal = causal_mask is None and q_len > 1 + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1) + + lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to( + query_states.dtype + ) + lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to( + query_states.dtype + ) + lambda_full = lambda_1 - lambda_2 + self.lambda_init + + attn_output = attn_output1 - lambda_full * attn_output2 + attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output) + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + attn_output = self.o_proj(attn_output) + return attn_output, None + + +DIFFLLAMA_ATTENTION_CLASSES = { + "eager": DiffLlamaAttention, + "flash_attention_2": DiffLlamaFlashAttention2, + "sdpa": DiffLlamaSdpaAttention, +} + + +class DiffLlamaDecoderLayer(LlamaDecoderLayer): + def __init__(self, config: DiffLlamaConfig, layer_idx: int): + super().__init__(config, layer_idx) + + self.self_attn = DIFFLLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + + +class DiffLlamaPreTrainedModel(LlamaPreTrainedModel): + _supports_flex_attn = False + _supports_attention_backend = False + + def _init_weights(self, module): + PreTrainedModel._init_weights(self, module) + if isinstance(module, DiffLlamaAttention): + module.lambda_q1.data.normal_(0, self.config.lambda_std_dev) + module.lambda_k1.data.normal_(0, self.config.lambda_std_dev) + module.lambda_q2.data.normal_(0, self.config.lambda_std_dev) + module.lambda_k2.data.normal_(0, self.config.lambda_std_dev) + + +class DiffLlamaModel(LlamaModel): + pass + + +class DiffLlamaForCausalLM(GemmaForCausalLM): + pass + + +class DiffLlamaForSequenceClassification(LlamaForSequenceClassification): + pass + + +class DiffLlamaForQuestionAnswering(LlamaForQuestionAnswering): + pass + + +class DiffLlamaForTokenClassification(LlamaForTokenClassification): + pass + + +__all__ = [ + "DiffLlamaPreTrainedModel", + "DiffLlamaModel", + "DiffLlamaForCausalLM", + "DiffLlamaForSequenceClassification", + "DiffLlamaForQuestionAnswering", + "DiffLlamaForTokenClassification", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b64cdbb3c7eb0467f6112225b8c0d9e1f65f9e99 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dinat import * + from .modeling_dinat import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbe30443f29195d178c9668f72978179c9f285b5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/configuration_dinat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/configuration_dinat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9253c61b9c513defb5fde11579c8216b5d3d39a6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/configuration_dinat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/modeling_dinat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/modeling_dinat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..789f73cec864be698526ecda3173b7d2136ca2fe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/modeling_dinat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/configuration_dinat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/configuration_dinat.py new file mode 100644 index 0000000000000000000000000000000000000000..f7d7fa509c5a3b2f5efc3b936cf1761b4ab0e107 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/configuration_dinat.py @@ -0,0 +1,152 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Dilated Neighborhood Attention Transformer model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class DinatConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DinatModel`]. It is used to instantiate a Dinat + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Dinat + [shi-labs/dinat-mini-in1k-224](https://huggingface.co/shi-labs/dinat-mini-in1k-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + patch_size (`int`, *optional*, defaults to 4): + The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + embed_dim (`int`, *optional*, defaults to 64): + Dimensionality of patch embedding. + depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 5]`): + Number of layers in each level of the encoder. + num_heads (`list[int]`, *optional*, defaults to `[2, 4, 8, 16]`): + Number of attention heads in each layer of the Transformer encoder. + kernel_size (`int`, *optional*, defaults to 7): + Neighborhood Attention kernel size. + dilations (`list[list[int]]`, *optional*, defaults to `[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]]`): + Dilation value of each NA layer in the Transformer encoder. + mlp_ratio (`float`, *optional*, defaults to 3.0): + Ratio of MLP hidden dimensionality to embedding dimensionality. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether or not a learnable bias should be added to the queries, keys and values. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + drop_path_rate (`float`, *optional*, defaults to 0.1): + Stochastic depth rate. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + layer_scale_init_value (`float`, *optional*, defaults to 0.0): + The initial value for the layer scale. Disabled if <=0. + out_features (`list[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`list[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + + Example: + + ```python + >>> from transformers import DinatConfig, DinatModel + + >>> # Initializing a Dinat shi-labs/dinat-mini-in1k-224 style configuration + >>> configuration = DinatConfig() + + >>> # Initializing a model (with random weights) from the shi-labs/dinat-mini-in1k-224 style configuration + >>> model = DinatModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "dinat" + + attribute_map = { + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + } + + def __init__( + self, + patch_size=4, + num_channels=3, + embed_dim=64, + depths=[3, 4, 6, 5], + num_heads=[2, 4, 8, 16], + kernel_size=7, + dilations=[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]], + mlp_ratio=3.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + initializer_range=0.02, + layer_norm_eps=1e-5, + layer_scale_init_value=0.0, + out_features=None, + out_indices=None, + **kwargs, + ): + super().__init__(**kwargs) + + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_layers = len(depths) + self.num_heads = num_heads + self.kernel_size = kernel_size + self.dilations = dilations + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + # we set the hidden_size attribute in order to make Dinat work with VisionEncoderDecoderModel + # this indicates the channel dimension after the last stage of the model + self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) + self.layer_scale_init_value = layer_scale_init_value + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + + +__all__ = ["DinatConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/modeling_dinat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/modeling_dinat.py new file mode 100644 index 0000000000000000000000000000000000000000..4b7ec37b0ea8489e5db87df22c1876fd4548fe86 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/modeling_dinat.py @@ -0,0 +1,855 @@ +# coding=utf-8 +# Copyright 2022 SHI Labs and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Dilated Neighborhood Attention Transformer model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BackboneOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + OptionalDependencyNotAvailable, + auto_docstring, + is_natten_available, + logging, + requires_backends, +) +from ...utils.backbone_utils import BackboneMixin +from .configuration_dinat import DinatConfig + + +if is_natten_available(): + from natten.functional import natten2dav, natten2dqkrpb +else: + + def natten2dqkrpb(*args, **kwargs): + raise OptionalDependencyNotAvailable() + + def natten2dav(*args, **kwargs): + raise OptionalDependencyNotAvailable() + + +logger = logging.get_logger(__name__) + + +# drop_path and DinatDropPath are from the timm library. + + +@dataclass +@auto_docstring( + custom_intro=""" + Dinat encoder's outputs, with potential hidden states and attentions. + """ +) +class DinatEncoderOutput(ModelOutput): + r""" + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Dinat model's outputs that also contains a pooling of the last hidden states. + """ +) +class DinatModelOutput(ModelOutput): + r""" + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed): + Average pooling of the last layer hidden-state. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Dinat outputs for image classification. + """ +) +class DinatImageClassifierOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +class DinatEmbeddings(nn.Module): + """ + Construct the patch and position embeddings. + """ + + def __init__(self, config): + super().__init__() + + self.patch_embeddings = DinatPatchEmbeddings(config) + + self.norm = nn.LayerNorm(config.embed_dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> tuple[torch.Tensor]: + embeddings = self.patch_embeddings(pixel_values) + embeddings = self.norm(embeddings) + + embeddings = self.dropout(embeddings) + + return embeddings + + +class DinatPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + patch_size = config.patch_size + num_channels, hidden_size = config.num_channels, config.embed_dim + self.num_channels = num_channels + + if patch_size == 4: + pass + else: + # TODO: Support arbitrary patch sizes. + raise ValueError("Dinat only supports patch size of 4 at the moment.") + + self.projection = nn.Sequential( + nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), + nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), + ) + + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor: + _, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + embeddings = self.projection(pixel_values) + embeddings = embeddings.permute(0, 2, 3, 1) + + return embeddings + + +class DinatDownsampler(nn.Module): + """ + Convolutional Downsampling Layer. + + Args: + dim (`int`): + Number of input channels. + norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. + """ + + def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None: + super().__init__() + self.dim = dim + self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) + self.norm = norm_layer(2 * dim) + + def forward(self, input_feature: torch.Tensor) -> torch.Tensor: + input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) + input_feature = self.norm(input_feature) + return input_feature + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Dinat +class DinatDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class NeighborhoodAttention(nn.Module): + def __init__(self, config, dim, num_heads, kernel_size, dilation): + super().__init__() + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.kernel_size = kernel_size + self.dilation = dilation + + # rpb is learnable relative positional biases; same concept is used Swin. + self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1))) + + self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + batch_size, seq_length, _ = hidden_states.shape + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + key_layer = ( + self.key(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + # Apply the scale factor before computing attention weights. It's usually more efficient because + # attention weights are typically a bigger tensor compared to query. + # It gives identical results because scalars are commutable in matrix multiplication. + query_layer = query_layer / math.sqrt(self.attention_head_size) + + # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases. + attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, self.dilation) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, self.dilation) + context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class NeighborhoodAttentionOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, dim) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class NeighborhoodAttentionModule(nn.Module): + def __init__(self, config, dim, num_heads, kernel_size, dilation): + super().__init__() + self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size, dilation) + self.output = NeighborhoodAttentionOutput(config, dim) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + self_outputs = self.self(hidden_states, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class DinatIntermediate(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class DinatOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class DinatLayer(nn.Module): + def __init__(self, config, dim, num_heads, dilation, drop_path_rate=0.0): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.kernel_size = config.kernel_size + self.dilation = dilation + self.window_size = self.kernel_size * self.dilation + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attention = NeighborhoodAttentionModule( + config, dim, num_heads, kernel_size=self.kernel_size, dilation=self.dilation + ) + self.drop_path = DinatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.intermediate = DinatIntermediate(config, dim) + self.output = DinatOutput(config, dim) + self.layer_scale_parameters = ( + nn.Parameter(config.layer_scale_init_value * torch.ones((2, dim)), requires_grad=True) + if config.layer_scale_init_value > 0 + else None + ) + + def maybe_pad(self, hidden_states, height, width): + window_size = self.window_size + pad_values = (0, 0, 0, 0, 0, 0) + if height < window_size or width < window_size: + pad_l = pad_t = 0 + pad_r = max(0, window_size - width) + pad_b = max(0, window_size - height) + pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b) + hidden_states = nn.functional.pad(hidden_states, pad_values) + return hidden_states, pad_values + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor, torch.Tensor]: + batch_size, height, width, channels = hidden_states.size() + shortcut = hidden_states + + hidden_states = self.layernorm_before(hidden_states) + # pad hidden_states if they are smaller than kernel size x dilation + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + + _, height_pad, width_pad, _ = hidden_states.shape + + attention_outputs = self.attention(hidden_states, output_attentions=output_attentions) + + attention_output = attention_outputs[0] + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + attention_output = attention_output[:, :height, :width, :].contiguous() + + if self.layer_scale_parameters is not None: + attention_output = self.layer_scale_parameters[0] * attention_output + + hidden_states = shortcut + self.drop_path(attention_output) + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.output(self.intermediate(layer_output)) + + if self.layer_scale_parameters is not None: + layer_output = self.layer_scale_parameters[1] * layer_output + + layer_output = hidden_states + self.drop_path(layer_output) + + layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) + return layer_outputs + + +class DinatStage(nn.Module): + def __init__(self, config, dim, depth, num_heads, dilations, drop_path_rate, downsample): + super().__init__() + self.config = config + self.dim = dim + self.layers = nn.ModuleList( + [ + DinatLayer( + config=config, + dim=dim, + num_heads=num_heads, + dilation=dilations[i], + drop_path_rate=drop_path_rate[i], + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm) + else: + self.downsample = None + + self.pointing = False + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + _, height, width, _ = hidden_states.size() + for i, layer_module in enumerate(self.layers): + layer_outputs = layer_module(hidden_states, output_attentions) + hidden_states = layer_outputs[0] + + hidden_states_before_downsampling = hidden_states + if self.downsample is not None: + hidden_states = self.downsample(hidden_states_before_downsampling) + + stage_outputs = (hidden_states, hidden_states_before_downsampling) + + if output_attentions: + stage_outputs += layer_outputs[1:] + return stage_outputs + + +class DinatEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.num_levels = len(config.depths) + self.config = config + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")] + self.levels = nn.ModuleList( + [ + DinatStage( + config=config, + dim=int(config.embed_dim * 2**i_layer), + depth=config.depths[i_layer], + num_heads=config.num_heads[i_layer], + dilations=config.dilations[i_layer], + drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=DinatDownsampler if (i_layer < self.num_levels - 1) else None, + ) + for i_layer in range(self.num_levels) + ] + ) + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + output_hidden_states_before_downsampling: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple, DinatEncoderOutput]: + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + # rearrange b h w c -> b c h w + reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + for i, layer_module in enumerate(self.levels): + layer_outputs = layer_module(hidden_states, output_attentions) + + hidden_states = layer_outputs[0] + hidden_states_before_downsampling = layer_outputs[1] + + if output_hidden_states and output_hidden_states_before_downsampling: + # rearrange b h w c -> b c h w + reshaped_hidden_state = hidden_states_before_downsampling.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states_before_downsampling,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + elif output_hidden_states and not output_hidden_states_before_downsampling: + # rearrange b h w c -> b c h w + reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + if output_attentions: + all_self_attentions += layer_outputs[2:] + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + + return DinatEncoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + reshaped_hidden_states=all_reshaped_hidden_states, + ) + + +@auto_docstring +class DinatPreTrainedModel(PreTrainedModel): + config: DinatConfig + base_model_prefix = "dinat" + main_input_name = "pixel_values" + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring +class DinatModel(DinatPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + + requires_backends(self, ["natten"]) + + self.config = config + self.num_levels = len(config.depths) + self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1)) + + self.embeddings = DinatEmbeddings(config) + self.encoder = DinatEncoder(config) + + self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps) + self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, DinatModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + pooled_output = None + if self.pooler is not None: + pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2)) + pooled_output = torch.flatten(pooled_output, 1) + + if not return_dict: + output = (sequence_output, pooled_output) + encoder_outputs[1:] + + return output + + return DinatModelOutput( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + reshaped_hidden_states=encoder_outputs.reshaped_hidden_states, + ) + + +@auto_docstring( + custom_intro=""" + Dinat Model transformer with an image classification head on top (a linear layer on top of the final hidden state + of the [CLS] token) e.g. for ImageNet. + """ +) +class DinatForImageClassification(DinatPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + requires_backends(self, ["natten"]) + + self.num_labels = config.num_labels + self.dinat = DinatModel(config) + + # Classifier head + self.classifier = ( + nn.Linear(self.dinat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, DinatImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.dinat( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DinatImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + reshaped_hidden_states=outputs.reshaped_hidden_states, + ) + + +@auto_docstring( + custom_intro=""" + NAT backbone, to be used with frameworks like DETR and MaskFormer. + """ +) +class DinatBackbone(DinatPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + + requires_backends(self, ["natten"]) + + self.embeddings = DinatEmbeddings(config) + self.encoder = DinatEncoder(config) + self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))] + + # Add layer norms to hidden states of out_features + hidden_states_norms = {} + for stage, num_channels in zip(self._out_features, self.channels): + hidden_states_norms[stage] = nn.LayerNorm(num_channels) + self.hidden_states_norms = nn.ModuleDict(hidden_states_norms) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + @auto_docstring + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + r""" + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224") + >>> model = AutoBackbone.from_pretrained( + ... "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"] + ... ) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 512, 7, 7] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, + output_attentions=output_attentions, + output_hidden_states=True, + output_hidden_states_before_downsampling=True, + return_dict=True, + ) + + hidden_states = outputs.reshaped_hidden_states + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + batch_size, num_channels, height, width = hidden_state.shape + hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous() + hidden_state = hidden_state.view(batch_size, height * width, num_channels) + hidden_state = self.hidden_states_norms[stage](hidden_state) + hidden_state = hidden_state.view(batch_size, height, width, num_channels) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + feature_maps += (hidden_state,) + + if not return_dict: + output = (feature_maps,) + if output_hidden_states: + output += (outputs.hidden_states,) + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +__all__ = ["DinatForImageClassification", "DinatModel", "DinatPreTrainedModel", "DinatBackbone"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48bb6f4ca9223757afd8cf7da0bd2c90bd371784 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/configuration_dinov2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/configuration_dinov2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..923ce38f539afb2ce60cb26c2fb3e0181bfdf50a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/configuration_dinov2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/modeling_dinov2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/modeling_dinov2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..975e54dd5792e9d440fb576f2cd3b27cf8255c91 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/modeling_dinov2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/modeling_flax_dinov2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/modeling_flax_dinov2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86285018bba522776f6c43710816293d0b60637e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/modeling_flax_dinov2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2d10027b6a3b6375235a6785df044e8f0ce5fb33 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dinov2_with_registers import * + from .modeling_dinov2_with_registers import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py new file mode 100644 index 0000000000000000000000000000000000000000..ec4f446fc684f40d634927c1e7a52b64c5732b12 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py @@ -0,0 +1,159 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_dinov2_with_registers.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...configuration_utils import PretrainedConfig +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Dinov2WithRegistersModel`]. It is used to instantiate an + Dinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DINOv2 with Registers + [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the hidden size of the MLPs relative to the `hidden_size`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value to use for layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Stochastic depth rate per sample (when applied in the main path of residual layers). + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + num_register_tokens (`int`, *optional*, defaults to 4): + Number of register tokens to use. + out_features (`list[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`list[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + apply_layernorm (`bool`, *optional*, defaults to `True`): + Whether to apply layer normalization to the feature maps in case the model is used as backbone. + reshape_hidden_states (`bool`, *optional*, defaults to `True`): + Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in + case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, + seq_len, hidden_size)`. + + Example: + + ```python + >>> from transformers import Dinov2WithRegistersConfig, Dinov2WithRegistersModel + + >>> # Initializing a Dinov2WithRegisters base style configuration + >>> configuration = Dinov2WithRegistersConfig() + + >>> # Initializing a model (with random weights) from the base style configuration + >>> model = Dinov2WithRegistersModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "dinov2_with_registers" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + num_register_tokens=4, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.num_register_tokens = num_register_tokens + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + + +__all__ = ["Dinov2WithRegistersConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py new file mode 100644 index 0000000000000000000000000000000000000000..9c6f4c335f58b2ddd37bf4042d5f8e51c474cee9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py @@ -0,0 +1,712 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_dinov2_with_registers.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections.abc +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import TransformersKwargs, auto_docstring, torch_int +from ...utils.backbone_utils import BackboneMixin +from ...utils.generic import can_return_tuple, check_model_inputs +from .configuration_dinov2_with_registers import Dinov2WithRegistersConfig + + +class Dinov2WithRegistersPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +class Dinov2WithRegistersEmbeddings(nn.Module): + """ + Construct the CLS token, mask token, register tokens, position and patch embeddings. + """ + + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + self.patch_embeddings = Dinov2WithRegistersPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility + with the original implementation. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py + - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py + """ + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # Skip interpolation for matching dimensions (unless tracing) + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + # Handle class token and patch embeddings separately + class_pos_embed = self.position_embeddings[:, 0] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] + + # Calculate new dimensions + height = height // self.config.patch_size + width = width // self.config.patch_size + + # Reshape for interpolation + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + # Store original dtype for restoration after interpolation + target_dtype = patch_pos_embed.dtype + + # Interpolate at float32 precision + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(dtype=torch.float32), + size=(torch_int(height), torch_int(width)), # Explicit size instead of scale_factor + mode="bicubic", + align_corners=False, + antialias=True, + ).to(dtype=target_dtype) + + # Validate output dimensions if not tracing + if not torch.jit.is_tracing(): + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + + # Reshape back to original format + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + # Combine class and patch embeddings + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + if bool_masked_pos is not None: + embeddings = torch.where( + bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + # add register tokens + embeddings = torch.cat( + (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1 + ) + + embeddings = self.dropout(embeddings) + + return embeddings + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class Dinov2WithRegistersSelfAttention(nn.Module): + def __init__(self, config: Dinov2WithRegistersConfig): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def forward( + self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None + ) -> tuple[torch.Tensor, torch.Tensor]: + batch_size = hidden_states.shape[0] + new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size + + key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2) + value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2) + query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + return context_layer, attention_probs + + +class Dinov2WithRegistersSelfOutput(nn.Module): + """ + The residual connection is defined in Dinov2WithRegistersLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: Dinov2WithRegistersConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class Dinov2WithRegistersAttention(nn.Module): + def __init__(self, config: Dinov2WithRegistersConfig): + super().__init__() + self.attention = Dinov2WithRegistersSelfAttention(config) + self.output = Dinov2WithRegistersSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: set[int]): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + self_attn_output, _ = self.attention(hidden_states, head_mask) + output = self.output(self_attn_output, hidden_states) + return output + + +class Dinov2WithRegistersLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class Dinov2WithRegistersDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class Dinov2WithRegistersMLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +class Dinov2WithRegistersSwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +class Dinov2WithRegistersLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = Dinov2WithRegistersAttention(config) + self.layer_scale1 = Dinov2WithRegistersLayerScale(config) + self.drop_path = ( + Dinov2WithRegistersDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + ) + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = Dinov2WithRegistersSwiGLUFFN(config) + else: + self.mlp = Dinov2WithRegistersMLP(config) + self.layer_scale2 = Dinov2WithRegistersLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + hidden_states_norm = self.norm1(hidden_states) + self_attention_output = self.attention(hidden_states_norm, head_mask) + self_attention_output = self.layer_scale1(self_attention_output) + + # first residual connection + hidden_states = self.drop_path(self_attention_output) + hidden_states + + # in Dinov2WithRegisters, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + return layer_output + + +class Dinov2WithRegistersEncoder(nn.Module): + def __init__(self, config: Dinov2WithRegistersConfig): + super().__init__() + self.config = config + self.layer = nn.ModuleList([Dinov2WithRegistersLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_hidden_states: bool = False + ) -> BaseModelOutput: + all_hidden_states = [hidden_states] if output_hidden_states else None + for i, layer_module in enumerate(self.layer): + layer_head_mask = head_mask[i] if head_mask is not None else None + hidden_states = layer_module(hidden_states, layer_head_mask) + if all_hidden_states: + all_hidden_states.append(hidden_states) + + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=tuple(all_hidden_states) if all_hidden_states else None, + ) + + +@auto_docstring +class Dinov2WithRegistersPreTrainedModel(PreTrainedModel): + config: Dinov2WithRegistersConfig + base_model_prefix = "dinov2_with_registers" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["Dinov2WithRegistersLayer"] + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "attentions": Dinov2WithRegistersSelfAttention, + } + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, Dinov2WithRegistersEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + + module.mask_token.data.zero_() + module.register_tokens.data.zero_() + elif isinstance(module, Dinov2WithRegistersLayerScale): # noqa: F821 + module.lambda1.data.fill_(self.config.layerscale_value) + + +@auto_docstring +class Dinov2WithRegistersModel(Dinov2WithRegistersPreTrainedModel): + def __init__(self, config: Dinov2WithRegistersConfig): + super().__init__(config) + self.config = config + + self.embeddings = Dinov2WithRegistersEmbeddings(config) + self.encoder = Dinov2WithRegistersEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + **kwargs, + ) -> BaseModelOutputWithPooling: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for + pre-training. + """ + if output_hidden_states is None: + output_hidden_states = self.config.output_hidden_states + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + + encoder_outputs: BaseModelOutput = self.encoder( + embedding_output, head_mask=head_mask, output_hidden_states=output_hidden_states + ) + sequence_output = encoder_outputs.last_hidden_state + sequence_output = self.layernorm(sequence_output) + pooled_output = sequence_output[:, 0, :] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +@auto_docstring( + custom_intro=""" + Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state + of the [CLS] token) e.g. for ImageNet. + """ +) +class Dinov2WithRegistersForImageClassification(Dinov2WithRegistersPreTrainedModel): + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.dinov2_with_registers = Dinov2WithRegistersModel(config) + + # Classifier head + self.classifier = ( + nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> ImageClassifierOutput: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + outputs: BaseModelOutputWithPooling = self.dinov2_with_registers(pixel_values, head_mask=head_mask, **kwargs) + sequence_output = outputs.last_hidden_state # batch_size, sequence_length, hidden_size + + cls_token = sequence_output[:, 0] + # cls and register tokens should not be included in patch tokens variable + patch_tokens = sequence_output[:, 1 + self.config.num_register_tokens :] + + linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1) + logits = self.classifier(linear_input) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config, **kwargs) + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Dinov2WithRegisters backbone, to be used with frameworks like DETR and MaskFormer. + """ +) +class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] + self.embeddings = Dinov2WithRegistersEmbeddings(config) + self.encoder = Dinov2WithRegistersEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.num_register_tokens = config.num_register_tokens + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings: + return self.embeddings.patch_embeddings + + @check_model_inputs + @auto_docstring + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + **kwargs, + ) -> BackboneOutput: + r""" + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"] + ... ) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 16, 16] + ```""" + if output_hidden_states is None: + output_hidden_states = self.config.output_hidden_states + + embedding_output = self.embeddings(pixel_values) + output: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=True) + hidden_states = output.hidden_states + + feature_maps = [] + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + if self.config.apply_layernorm: + hidden_state = self.layernorm(hidden_state) + if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, 1 + self.num_register_tokens :] + # this was actually a bug in the original implementation that we copied here, + # cause normally the order is height, width + batch_size, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + feature_maps.append(hidden_state) + + return BackboneOutput( + feature_maps=tuple(feature_maps), + hidden_states=hidden_states if output_hidden_states else None, + ) + + +__all__ = [ + "Dinov2WithRegistersPreTrainedModel", + "Dinov2WithRegistersModel", + "Dinov2WithRegistersForImageClassification", + "Dinov2WithRegistersBackbone", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py new file mode 100644 index 0000000000000000000000000000000000000000..686528002b09c9689d66a057ed55eb1a43b0d256 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py @@ -0,0 +1,435 @@ +# coding=utf-8 +# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import torch +from torch import nn + +from ....transformers.models.dinov2.modeling_dinov2 import ( + Dinov2Backbone, + Dinov2Encoder, + Dinov2ForImageClassification, + Dinov2Model, + Dinov2PatchEmbeddings, + Dinov2PreTrainedModel, +) +from ...configuration_utils import PretrainedConfig +from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, logging, torch_int +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Dinov2WithRegistersModel`]. It is used to instantiate an + Dinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DINOv2 with Registers + [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the hidden size of the MLPs relative to the `hidden_size`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value to use for layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Stochastic depth rate per sample (when applied in the main path of residual layers). + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + num_register_tokens (`int`, *optional*, defaults to 4): + Number of register tokens to use. + out_features (`list[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`list[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + apply_layernorm (`bool`, *optional*, defaults to `True`): + Whether to apply layer normalization to the feature maps in case the model is used as backbone. + reshape_hidden_states (`bool`, *optional*, defaults to `True`): + Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in + case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, + seq_len, hidden_size)`. + + Example: + + ```python + >>> from transformers import Dinov2WithRegistersConfig, Dinov2WithRegistersModel + + >>> # Initializing a Dinov2WithRegisters base style configuration + >>> configuration = Dinov2WithRegistersConfig() + + >>> # Initializing a model (with random weights) from the base style configuration + >>> model = Dinov2WithRegistersModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "dinov2_with_registers" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + num_register_tokens=4, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.num_register_tokens = num_register_tokens + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + + +class Dinov2WithRegistersPatchEmbeddings(Dinov2PatchEmbeddings): + pass + + +class Dinov2WithRegistersEmbeddings(nn.Module): + """ + Construct the CLS token, mask token, register tokens, position and patch embeddings. + """ + + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + self.patch_embeddings = Dinov2WithRegistersPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility + with the original implementation. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py + - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py + """ + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # Skip interpolation for matching dimensions (unless tracing) + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + # Handle class token and patch embeddings separately + class_pos_embed = self.position_embeddings[:, 0] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] + + # Calculate new dimensions + height = height // self.config.patch_size + width = width // self.config.patch_size + + # Reshape for interpolation + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + # Store original dtype for restoration after interpolation + target_dtype = patch_pos_embed.dtype + + # Interpolate at float32 precision + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(dtype=torch.float32), + size=(torch_int(height), torch_int(width)), # Explicit size instead of scale_factor + mode="bicubic", + align_corners=False, + antialias=True, + ).to(dtype=target_dtype) + + # Validate output dimensions if not tracing + if not torch.jit.is_tracing(): + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + + # Reshape back to original format + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + # Combine class and patch embeddings + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + if bool_masked_pos is not None: + embeddings = torch.where( + bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + # add register tokens + embeddings = torch.cat( + (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1 + ) + + embeddings = self.dropout(embeddings) + + return embeddings + + +class Dinov2WithRegistersEncoder(Dinov2Encoder): + pass + + +class Dinov2WithRegistersPreTrainedModel(Dinov2PreTrainedModel): + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, Dinov2WithRegistersEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + + module.mask_token.data.zero_() + module.register_tokens.data.zero_() + elif isinstance(module, Dinov2WithRegistersLayerScale): # noqa: F821 + module.lambda1.data.fill_(self.config.layerscale_value) + + +class Dinov2WithRegistersModel(Dinov2Model): + pass + + +class Dinov2WithRegistersForImageClassification(Dinov2ForImageClassification): + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> ImageClassifierOutput: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + outputs: BaseModelOutputWithPooling = self.dinov2_with_registers(pixel_values, head_mask=head_mask, **kwargs) + sequence_output = outputs.last_hidden_state # batch_size, sequence_length, hidden_size + + cls_token = sequence_output[:, 0] + # cls and register tokens should not be included in patch tokens variable + patch_tokens = sequence_output[:, 1 + self.config.num_register_tokens :] + + linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1) + logits = self.classifier(linear_input) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config, **kwargs) + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class Dinov2WithRegistersBackbone(Dinov2Backbone): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + + self.num_register_tokens = config.num_register_tokens + self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] + self.embeddings = Dinov2WithRegistersEmbeddings(config) + self.encoder = Dinov2WithRegistersEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings: + return self.embeddings.patch_embeddings + + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + **kwargs, + ) -> BackboneOutput: + r""" + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"] + ... ) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 16, 16] + ```""" + if output_hidden_states is None: + output_hidden_states = self.config.output_hidden_states + + embedding_output = self.embeddings(pixel_values) + output: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=True) + hidden_states = output.hidden_states + + feature_maps = [] + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + if self.config.apply_layernorm: + hidden_state = self.layernorm(hidden_state) + if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, 1 + self.num_register_tokens :] + # this was actually a bug in the original implementation that we copied here, + # cause normally the order is height, width + batch_size, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + feature_maps.append(hidden_state) + + return BackboneOutput( + feature_maps=tuple(feature_maps), + hidden_states=hidden_states if output_hidden_states else None, + ) + + +__all__ = [ + "Dinov2WithRegistersConfig", + "Dinov2WithRegistersPreTrainedModel", + "Dinov2WithRegistersModel", + "Dinov2WithRegistersForImageClassification", + "Dinov2WithRegistersBackbone", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..599b811d4b320ccb5e37de217903203234cb210e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/configuration_dots1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/configuration_dots1.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92d3d510d394c16661ab288a691967d7fc9e97f2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/configuration_dots1.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/modeling_dots1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/modeling_dots1.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d20bd4055bdf368cd830a646ff8138553628f8b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/modeling_dots1.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/modular_dots1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/modular_dots1.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f2e4d2b614f86cda26121d71c053efb2b14c5f3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/modular_dots1.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..644582dc354127eae90c97b70441e83e866a9db0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/configuration_dpr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/configuration_dpr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7cff243f188e561abb1590c18b57bd3ca27c3288 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/configuration_dpr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/modeling_dpr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/modeling_dpr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e5eb4f4d24c2a1de027096d615415d2b2179b62 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/modeling_dpr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/modeling_tf_dpr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/modeling_tf_dpr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ec9e4c526880f439b96ba51b71e1a6cca4cde2b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/modeling_tf_dpr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c716d6dc82e478bfc4689ff6e08dea518c1677d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60b79af407650d33542b97524342ee0fba792ff8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..acb7d0e68bcf65f2ff3ffd97412eef64b294bd56 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/configuration_edgetam.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/configuration_edgetam.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0b8970371b7bcce7b3f24c8074032adda7cf985 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/configuration_edgetam.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/modeling_edgetam.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/modeling_edgetam.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0a059a23eda779a10986b4835102c6b4ccbcad3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/modeling_edgetam.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/modular_edgetam.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/modular_edgetam.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..054b910ec29bc99d72568c85b18c8b22ef935ad8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/modular_edgetam.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0041ad10528d3476375e9c063642a8615fcd96cf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/configuration_edgetam_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/configuration_edgetam_video.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..714257f6830885e281e2965f537db9314c482e58 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/configuration_edgetam_video.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/modular_edgetam_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/modular_edgetam_video.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64e81e52a3a867a3be66bb5a04fa4b4361de10e9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/modular_edgetam_video.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5ab3632346a8617f48ac7e4da81f906afa8e04b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/configuration_emu3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/configuration_emu3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1484c2d0363731fa03458eb00227ec60d523e3ef Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/configuration_emu3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/image_processing_emu3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/image_processing_emu3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..724667c4fde01b25406b0a304ec4eee34ac98f74 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/image_processing_emu3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/modeling_emu3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/modeling_emu3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81b29339028dcd953130b34ad2610a2c311bb02a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/modeling_emu3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/modular_emu3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/modular_emu3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c95204eb37e7e2aefb3ca524ecef36c59f9ae43 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/modular_emu3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/processing_emu3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/processing_emu3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22c400874c8080e2a590bcd49ee5ecb8606c9d58 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/processing_emu3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b637f9cf4f167ceafc67bcccf46d18817038825 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/configuration_encodec.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/configuration_encodec.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..651fd220de7a9e6eed86bbde354be56f3113e3f7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/configuration_encodec.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/feature_extraction_encodec.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/feature_extraction_encodec.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..186107de88454a08029eafe4030f32618dcc645f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/feature_extraction_encodec.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/modeling_encodec.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/modeling_encodec.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0c1ab9537cbb6ee96f34c85d849310070ac1d0b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/modeling_encodec.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/chunk_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/chunk_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a735fcee001a0d94215c7c101ecad523b4fee3fd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/chunk_utils.py @@ -0,0 +1,398 @@ +# Copyright 2021 AlQuraishi Laboratory +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import math +from collections.abc import Iterable, Sequence +from functools import partial +from typing import Any, Callable, Optional, Union + +import torch + +from .tensor_utils import tensor_tree_map, tree_map + + +def _fetch_dims(tree: Union[dict, list, tuple, torch.Tensor]) -> list[tuple[int, ...]]: + shapes = [] + if isinstance(tree, dict): + for v in tree.values(): + shapes.extend(_fetch_dims(v)) + elif isinstance(tree, (list, tuple)): + for t in tree: + shapes.extend(_fetch_dims(t)) + elif isinstance(tree, torch.Tensor): + shapes.append(tree.shape) + else: + raise TypeError("Not supported") + + return shapes + + +@torch.jit.ignore +def _flat_idx_to_idx(flat_idx: int, dims: tuple[int, ...]) -> tuple[int, ...]: + idx = [] + for d in reversed(dims): + idx.append(flat_idx % d) + flat_idx = flat_idx // d + + return tuple(reversed(idx)) + + +@torch.jit.ignore +def _get_minimal_slice_set( + start: Sequence[int], + end: Sequence[int], + dims: Sequence[int], + start_edges: Optional[Sequence[bool]] = None, + end_edges: Optional[Sequence[bool]] = None, +) -> list[tuple[slice, ...]]: + """ + Produces an ordered sequence of tensor slices that, when used in sequence on a tensor with shape dims, yields + tensors that contain every leaf in the contiguous range [start, end]. Care is taken to yield a short sequence of + slices, and perhaps even the shortest possible (I'm pretty sure it's the latter). + + end is INCLUSIVE. + """ + + # start_edges and end_edges both indicate whether, starting from any given + # dimension, the start/end index is at the top/bottom edge of the + # corresponding tensor, modeled as a tree + def reduce_edge_list(l: list[bool]) -> None: + tally = True + for i in range(len(l)): + reversed_idx = -1 * (i + 1) + l[reversed_idx] &= tally + tally = l[reversed_idx] + + if start_edges is None: + start_edges = [s == 0 for s in start] + reduce_edge_list(start_edges) + if end_edges is None: + end_edges = [e == (d - 1) for e, d in zip(end, dims)] + reduce_edge_list(end_edges) + + # Base cases. Either start/end are empty and we're done, or the final, + # one-dimensional tensor can be simply sliced + if len(start) == 0: + return [()] + elif len(start) == 1: + return [(slice(start[0], end[0] + 1),)] + + slices: list[tuple[slice, ...]] = [] + path_list: list[slice] = [] + + # Dimensions common to start and end can be selected directly + for s, e in zip(start, end): + if s == e: + path_list.append(slice(s, s + 1)) + else: + break + + path: tuple[slice, ...] = tuple(path_list) + divergence_idx = len(path) + + # start == end, and we're done + if divergence_idx == len(dims): + return [path] + + def upper() -> tuple[tuple[slice, ...], ...]: + assert start_edges is not None + assert end_edges is not None + + sdi = start[divergence_idx] + return tuple( + path + (slice(sdi, sdi + 1),) + s + for s in _get_minimal_slice_set( + start[divergence_idx + 1 :], + [d - 1 for d in dims[divergence_idx + 1 :]], + dims[divergence_idx + 1 :], + start_edges=start_edges[divergence_idx + 1 :], + end_edges=[True for _ in end_edges[divergence_idx + 1 :]], + ) + ) + + def lower() -> tuple[tuple[slice, ...], ...]: + assert start_edges is not None + assert end_edges is not None + + edi = end[divergence_idx] + return tuple( + path + (slice(edi, edi + 1),) + s + for s in _get_minimal_slice_set( + [0 for _ in start[divergence_idx + 1 :]], + end[divergence_idx + 1 :], + dims[divergence_idx + 1 :], + start_edges=[True for _ in start_edges[divergence_idx + 1 :]], + end_edges=end_edges[divergence_idx + 1 :], + ) + ) + + # If both start and end are at the edges of the subtree rooted at + # divergence_idx, we can just select the whole subtree at once + if start_edges[divergence_idx] and end_edges[divergence_idx]: + slices.append(path + (slice(start[divergence_idx], end[divergence_idx] + 1),)) + # If just start is at the edge, we can grab almost all of the subtree, + # treating only the ragged bottom edge as an edge case + elif start_edges[divergence_idx]: + slices.append(path + (slice(start[divergence_idx], end[divergence_idx]),)) + slices.extend(lower()) + # Analogous to the previous case, but the top is ragged this time + elif end_edges[divergence_idx]: + slices.extend(upper()) + slices.append(path + (slice(start[divergence_idx] + 1, end[divergence_idx] + 1),)) + # If both sides of the range are ragged, we need to handle both sides + # separately. If there's contiguous meat in between them, we can index it + # in one big chunk + else: + slices.extend(upper()) + middle_ground = end[divergence_idx] - start[divergence_idx] + if middle_ground > 1: + slices.append(path + (slice(start[divergence_idx] + 1, end[divergence_idx]),)) + slices.extend(lower()) + + return slices + + +@torch.jit.ignore +def _chunk_slice(t: torch.Tensor, flat_start: int, flat_end: int, no_batch_dims: int) -> torch.Tensor: + """ + Equivalent to + + t.reshape((-1,) + t.shape[no_batch_dims:])[flat_start:flat_end] + + but without the need for the initial reshape call, which can be memory-intensive in certain situations. The only + reshape operations in this function are performed on sub-tensors that scale with (flat_end - flat_start), the chunk + size. + """ + + batch_dims = t.shape[:no_batch_dims] + start_idx = list(_flat_idx_to_idx(flat_start, batch_dims)) + # _get_minimal_slice_set is inclusive + end_idx = list(_flat_idx_to_idx(flat_end - 1, batch_dims)) + + # Get an ordered list of slices to perform + slices = _get_minimal_slice_set( + start_idx, + end_idx, + batch_dims, + ) + + sliced_tensors = [t[s] for s in slices] + + return torch.cat([s.view((-1,) + t.shape[no_batch_dims:]) for s in sliced_tensors]) + + +def chunk_layer( + layer: Callable, + inputs: dict[str, Any], + chunk_size: int, + no_batch_dims: int, + low_mem: bool = False, + _out: Any = None, + _add_into_out: bool = False, +) -> Any: + """ + Implements the "chunking" procedure described in section 1.11.8. + + Layer outputs and inputs are assumed to be simple "pytrees," consisting only of (arbitrarily nested) lists, tuples, + and dicts with torch.Tensor leaves. + + Args: + layer: + The layer to be applied chunk-wise + inputs: + A (non-nested) dictionary of keyworded inputs. All leaves must be tensors and must share the same batch + dimensions. + chunk_size: + The number of sub-batches per chunk. If multiple batch dimensions are specified, a "sub-batch" is defined + as a single indexing of all batch dimensions simultaneously (s.t. the number of sub-batches is the product + of the batch dimensions). + no_batch_dims: + How many of the initial dimensions of each input tensor can be considered batch dimensions. + low_mem: + Avoids flattening potentially large input tensors. Unnecessary in most cases, and is ever so slightly + slower than the default setting. + Returns: + The reassembled output of the layer on the inputs. + """ + if not (len(inputs) > 0): + raise ValueError("Must provide at least one input") + + initial_dims = [shape[:no_batch_dims] for shape in _fetch_dims(inputs)] + orig_batch_dims = tuple(max(s) for s in zip(*initial_dims)) + + def _prep_inputs(t: torch.Tensor) -> torch.Tensor: + if not low_mem: + if sum(t.shape[:no_batch_dims]) != no_batch_dims: + t = t.expand(orig_batch_dims + t.shape[no_batch_dims:]) + t = t.reshape(-1, *t.shape[no_batch_dims:]) + else: + t = t.expand(orig_batch_dims + t.shape[no_batch_dims:]) + return t + + prepped_inputs: dict[str, Any] = tensor_tree_map(_prep_inputs, inputs) + prepped_outputs = None + if _out is not None: + prepped_outputs = tensor_tree_map(lambda t: t.view([-1] + list(t.shape[no_batch_dims:])), _out) + + flat_batch_dim = 1 + for d in orig_batch_dims: + flat_batch_dim *= d + + no_chunks = flat_batch_dim // chunk_size + (flat_batch_dim % chunk_size != 0) + + def _select_chunk(t: torch.Tensor) -> torch.Tensor: + return t[i : i + chunk_size] if t.shape[0] != 1 else t + + i = 0 + out = prepped_outputs + for _ in range(no_chunks): + # Chunk the input + if not low_mem: + select_chunk = _select_chunk + else: + select_chunk = partial( + _chunk_slice, + flat_start=i, + flat_end=min(flat_batch_dim, i + chunk_size), + no_batch_dims=len(orig_batch_dims), + ) + + chunks: dict[str, Any] = tensor_tree_map(select_chunk, prepped_inputs) + + # Run the layer on the chunk + output_chunk = layer(**chunks) + + # Allocate space for the output + if out is None: + out = tensor_tree_map(lambda t: t.new_zeros((flat_batch_dim,) + t.shape[1:]), output_chunk) + + # Put the chunk in its pre-allocated space + if isinstance(output_chunk, dict): + + def assign(d1: dict, d2: dict) -> None: + for k, v in d1.items(): + if isinstance(v, dict): + assign(v, d2[k]) + else: + if _add_into_out: + v[i : i + chunk_size] += d2[k] + else: + v[i : i + chunk_size] = d2[k] + + assign(out, output_chunk) + elif isinstance(output_chunk, tuple): + for x1, x2 in zip(out, output_chunk): + if _add_into_out: + x1[i : i + chunk_size] += x2 + else: + x1[i : i + chunk_size] = x2 + elif isinstance(output_chunk, torch.Tensor): + if _add_into_out: + out[i : i + chunk_size] += output_chunk + else: + out[i : i + chunk_size] = output_chunk + else: + raise TypeError("Not supported") + + i += chunk_size + + out = tensor_tree_map(lambda t: t.view(orig_batch_dims + t.shape[1:]), out) + + return out + + +class ChunkSizeTuner: + def __init__( + self, + # Heuristically, runtimes for most of the modules in the network + # plateau earlier than this on all GPUs I've run the model on. + max_chunk_size: int = 512, + ): + self.max_chunk_size = max_chunk_size + self.cached_chunk_size: Optional[int] = None + self.cached_arg_data: Optional[tuple] = None + + def _determine_favorable_chunk_size(self, fn: Callable, args: tuple, min_chunk_size: int) -> int: + logging.info("Tuning chunk size...") + + if min_chunk_size >= self.max_chunk_size: + return min_chunk_size + + candidates: list[int] = [2**l for l in range(int(math.log2(self.max_chunk_size)) + 1)] + candidates = [c for c in candidates if c > min_chunk_size] + candidates = [min_chunk_size] + candidates + candidates[-1] += 4 + + def test_chunk_size(chunk_size: int) -> bool: + try: + with torch.no_grad(): + fn(*args, chunk_size=chunk_size) + return True + except RuntimeError: + return False + + min_viable_chunk_size_index = 0 + i = len(candidates) - 1 + while i > min_viable_chunk_size_index: + viable = test_chunk_size(candidates[i]) + if not viable: + i = (min_viable_chunk_size_index + i) // 2 + else: + min_viable_chunk_size_index = i + i = (i + len(candidates) - 1) // 2 + + return candidates[min_viable_chunk_size_index] + + def _compare_arg_caches(self, ac1: Iterable, ac2: Iterable) -> bool: + consistent = True + for a1, a2 in zip(ac1, ac2): + assert type(ac1) is type(ac2) + if isinstance(ac1, (list, tuple)): + consistent &= self._compare_arg_caches(a1, a2) + elif isinstance(ac1, dict): + a1_items = [v for _, v in sorted(a1.items(), key=lambda x: x[0])] + a2_items = [v for _, v in sorted(a2.items(), key=lambda x: x[0])] + consistent &= self._compare_arg_caches(a1_items, a2_items) + else: + consistent &= a1 == a2 + + return consistent + + def tune_chunk_size( + self, + representative_fn: Callable, + args: tuple, + min_chunk_size: int, + ) -> int: + consistent = True + arg_data: tuple = tree_map(lambda a: a.shape if isinstance(a, torch.Tensor) else a, args, object) + if self.cached_arg_data is not None: + # If args have changed shape/value, we need to re-tune + assert len(self.cached_arg_data) == len(arg_data) + consistent = self._compare_arg_caches(self.cached_arg_data, arg_data) + else: + # Otherwise, we can reuse the precomputed value + consistent = False + + if not consistent: + self.cached_chunk_size = self._determine_favorable_chunk_size( + representative_fn, + args, + min_chunk_size, + ) + self.cached_arg_data = arg_data + + assert self.cached_chunk_size is not None + + return self.cached_chunk_size diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/residue_constants.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/residue_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..e92e65d29bfb2e7a0ad640e71d983c1c467d3a54 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/residue_constants.py @@ -0,0 +1,979 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Constants used in AlphaFold.""" + +import collections +import copy +import functools +from collections.abc import Mapping, Sequence +from importlib import resources + +import numpy as np + + +# Internal import (35fd). + + +# Distance from one CA to next CA [trans configuration: omega = 180]. +ca_ca = 3.80209737096 + +# Format: The list for each AA type contains chi1, chi2, chi3, chi4 in +# this order (or a relevant subset from chi1 onwards). ALA and GLY don't have +# chi angles so their chi angle lists are empty. +chi_angles_atoms: dict[str, list[list[str]]] = { + "ALA": [], + # Chi5 in arginine is always 0 +- 5 degrees, so ignore it. + "ARG": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "NE"], ["CG", "CD", "NE", "CZ"]], + "ASN": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]], + "ASP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]], + "CYS": [["N", "CA", "CB", "SG"]], + "GLN": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "OE1"]], + "GLU": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "OE1"]], + "GLY": [], + "HIS": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "ND1"]], + "ILE": [["N", "CA", "CB", "CG1"], ["CA", "CB", "CG1", "CD1"]], + "LEU": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]], + "LYS": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "CE"], ["CG", "CD", "CE", "NZ"]], + "MET": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "SD"], ["CB", "CG", "SD", "CE"]], + "PHE": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]], + "PRO": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"]], + "SER": [["N", "CA", "CB", "OG"]], + "THR": [["N", "CA", "CB", "OG1"]], + "TRP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]], + "TYR": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]], + "VAL": [["N", "CA", "CB", "CG1"]], +} + +# If chi angles given in fixed-length array, this matrix determines how to mask +# them for each AA type. The order is as per restype_order (see below). +chi_angles_mask: list[list[float]] = [ + [0.0, 0.0, 0.0, 0.0], # ALA + [1.0, 1.0, 1.0, 1.0], # ARG + [1.0, 1.0, 0.0, 0.0], # ASN + [1.0, 1.0, 0.0, 0.0], # ASP + [1.0, 0.0, 0.0, 0.0], # CYS + [1.0, 1.0, 1.0, 0.0], # GLN + [1.0, 1.0, 1.0, 0.0], # GLU + [0.0, 0.0, 0.0, 0.0], # GLY + [1.0, 1.0, 0.0, 0.0], # HIS + [1.0, 1.0, 0.0, 0.0], # ILE + [1.0, 1.0, 0.0, 0.0], # LEU + [1.0, 1.0, 1.0, 1.0], # LYS + [1.0, 1.0, 1.0, 0.0], # MET + [1.0, 1.0, 0.0, 0.0], # PHE + [1.0, 1.0, 0.0, 0.0], # PRO + [1.0, 0.0, 0.0, 0.0], # SER + [1.0, 0.0, 0.0, 0.0], # THR + [1.0, 1.0, 0.0, 0.0], # TRP + [1.0, 1.0, 0.0, 0.0], # TYR + [1.0, 0.0, 0.0, 0.0], # VAL +] + +# The following chi angles are pi periodic: they can be rotated by a multiple +# of pi without affecting the structure. +chi_pi_periodic: list[list[float]] = [ + [0.0, 0.0, 0.0, 0.0], # ALA + [0.0, 0.0, 0.0, 0.0], # ARG + [0.0, 0.0, 0.0, 0.0], # ASN + [0.0, 1.0, 0.0, 0.0], # ASP + [0.0, 0.0, 0.0, 0.0], # CYS + [0.0, 0.0, 0.0, 0.0], # GLN + [0.0, 0.0, 1.0, 0.0], # GLU + [0.0, 0.0, 0.0, 0.0], # GLY + [0.0, 0.0, 0.0, 0.0], # HIS + [0.0, 0.0, 0.0, 0.0], # ILE + [0.0, 0.0, 0.0, 0.0], # LEU + [0.0, 0.0, 0.0, 0.0], # LYS + [0.0, 0.0, 0.0, 0.0], # MET + [0.0, 1.0, 0.0, 0.0], # PHE + [0.0, 0.0, 0.0, 0.0], # PRO + [0.0, 0.0, 0.0, 0.0], # SER + [0.0, 0.0, 0.0, 0.0], # THR + [0.0, 0.0, 0.0, 0.0], # TRP + [0.0, 1.0, 0.0, 0.0], # TYR + [0.0, 0.0, 0.0, 0.0], # VAL + [0.0, 0.0, 0.0, 0.0], # UNK +] + +# Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi, +# psi and chi angles: +# 0: 'backbone group', +# 1: 'pre-omega-group', (empty) +# 2: 'phi-group', (currently empty, because it defines only hydrogens) +# 3: 'psi-group', +# 4,5,6,7: 'chi1,2,3,4-group' +# The atom positions are relative to the axis-end-atom of the corresponding +# rotation axis. The x-axis is in direction of the rotation axis, and the y-axis +# is defined such that the dihedral-angle-definiting atom (the last entry in +# chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate). +# format: [atomname, group_idx, rel_position] +rigid_group_atom_positions: dict[str, list[tuple[str, int, tuple[float, float, float]]]] = { + "ALA": [ + ("N", 0, (-0.525, 1.363, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.526, -0.000, -0.000)), + ("CB", 0, (-0.529, -0.774, -1.205)), + ("O", 3, (0.627, 1.062, 0.000)), + ], + "ARG": [ + ("N", 0, (-0.524, 1.362, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.525, -0.000, -0.000)), + ("CB", 0, (-0.524, -0.778, -1.209)), + ("O", 3, (0.626, 1.062, 0.000)), + ("CG", 4, (0.616, 1.390, -0.000)), + ("CD", 5, (0.564, 1.414, 0.000)), + ("NE", 6, (0.539, 1.357, -0.000)), + ("NH1", 7, (0.206, 2.301, 0.000)), + ("NH2", 7, (2.078, 0.978, -0.000)), + ("CZ", 7, (0.758, 1.093, -0.000)), + ], + "ASN": [ + ("N", 0, (-0.536, 1.357, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.526, -0.000, -0.000)), + ("CB", 0, (-0.531, -0.787, -1.200)), + ("O", 3, (0.625, 1.062, 0.000)), + ("CG", 4, (0.584, 1.399, 0.000)), + ("ND2", 5, (0.593, -1.188, 0.001)), + ("OD1", 5, (0.633, 1.059, 0.000)), + ], + "ASP": [ + ("N", 0, (-0.525, 1.362, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.527, 0.000, -0.000)), + ("CB", 0, (-0.526, -0.778, -1.208)), + ("O", 3, (0.626, 1.062, -0.000)), + ("CG", 4, (0.593, 1.398, -0.000)), + ("OD1", 5, (0.610, 1.091, 0.000)), + ("OD2", 5, (0.592, -1.101, -0.003)), + ], + "CYS": [ + ("N", 0, (-0.522, 1.362, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.524, 0.000, 0.000)), + ("CB", 0, (-0.519, -0.773, -1.212)), + ("O", 3, (0.625, 1.062, -0.000)), + ("SG", 4, (0.728, 1.653, 0.000)), + ], + "GLN": [ + ("N", 0, (-0.526, 1.361, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.526, 0.000, 0.000)), + ("CB", 0, (-0.525, -0.779, -1.207)), + ("O", 3, (0.626, 1.062, -0.000)), + ("CG", 4, (0.615, 1.393, 0.000)), + ("CD", 5, (0.587, 1.399, -0.000)), + ("NE2", 6, (0.593, -1.189, -0.001)), + ("OE1", 6, (0.634, 1.060, 0.000)), + ], + "GLU": [ + ("N", 0, (-0.528, 1.361, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.526, -0.000, -0.000)), + ("CB", 0, (-0.526, -0.781, -1.207)), + ("O", 3, (0.626, 1.062, 0.000)), + ("CG", 4, (0.615, 1.392, 0.000)), + ("CD", 5, (0.600, 1.397, 0.000)), + ("OE1", 6, (0.607, 1.095, -0.000)), + ("OE2", 6, (0.589, -1.104, -0.001)), + ], + "GLY": [ + ("N", 0, (-0.572, 1.337, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.517, -0.000, -0.000)), + ("O", 3, (0.626, 1.062, -0.000)), + ], + "HIS": [ + ("N", 0, (-0.527, 1.360, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.525, 0.000, 0.000)), + ("CB", 0, (-0.525, -0.778, -1.208)), + ("O", 3, (0.625, 1.063, 0.000)), + ("CG", 4, (0.600, 1.370, -0.000)), + ("CD2", 5, (0.889, -1.021, 0.003)), + ("ND1", 5, (0.744, 1.160, -0.000)), + ("CE1", 5, (2.030, 0.851, 0.002)), + ("NE2", 5, (2.145, -0.466, 0.004)), + ], + "ILE": [ + ("N", 0, (-0.493, 1.373, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.527, -0.000, -0.000)), + ("CB", 0, (-0.536, -0.793, -1.213)), + ("O", 3, (0.627, 1.062, -0.000)), + ("CG1", 4, (0.534, 1.437, -0.000)), + ("CG2", 4, (0.540, -0.785, -1.199)), + ("CD1", 5, (0.619, 1.391, 0.000)), + ], + "LEU": [ + ("N", 0, (-0.520, 1.363, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.525, -0.000, -0.000)), + ("CB", 0, (-0.522, -0.773, -1.214)), + ("O", 3, (0.625, 1.063, -0.000)), + ("CG", 4, (0.678, 1.371, 0.000)), + ("CD1", 5, (0.530, 1.430, -0.000)), + ("CD2", 5, (0.535, -0.774, 1.200)), + ], + "LYS": [ + ("N", 0, (-0.526, 1.362, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.526, 0.000, 0.000)), + ("CB", 0, (-0.524, -0.778, -1.208)), + ("O", 3, (0.626, 1.062, -0.000)), + ("CG", 4, (0.619, 1.390, 0.000)), + ("CD", 5, (0.559, 1.417, 0.000)), + ("CE", 6, (0.560, 1.416, 0.000)), + ("NZ", 7, (0.554, 1.387, 0.000)), + ], + "MET": [ + ("N", 0, (-0.521, 1.364, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.525, 0.000, 0.000)), + ("CB", 0, (-0.523, -0.776, -1.210)), + ("O", 3, (0.625, 1.062, -0.000)), + ("CG", 4, (0.613, 1.391, -0.000)), + ("SD", 5, (0.703, 1.695, 0.000)), + ("CE", 6, (0.320, 1.786, -0.000)), + ], + "PHE": [ + ("N", 0, (-0.518, 1.363, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.524, 0.000, -0.000)), + ("CB", 0, (-0.525, -0.776, -1.212)), + ("O", 3, (0.626, 1.062, -0.000)), + ("CG", 4, (0.607, 1.377, 0.000)), + ("CD1", 5, (0.709, 1.195, -0.000)), + ("CD2", 5, (0.706, -1.196, 0.000)), + ("CE1", 5, (2.102, 1.198, -0.000)), + ("CE2", 5, (2.098, -1.201, -0.000)), + ("CZ", 5, (2.794, -0.003, -0.001)), + ], + "PRO": [ + ("N", 0, (-0.566, 1.351, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.527, -0.000, 0.000)), + ("CB", 0, (-0.546, -0.611, -1.293)), + ("O", 3, (0.621, 1.066, 0.000)), + ("CG", 4, (0.382, 1.445, 0.0)), + # ('CD', 5, (0.427, 1.440, 0.0)), + ("CD", 5, (0.477, 1.424, 0.0)), # manually made angle 2 degrees larger + ], + "SER": [ + ("N", 0, (-0.529, 1.360, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.525, -0.000, -0.000)), + ("CB", 0, (-0.518, -0.777, -1.211)), + ("O", 3, (0.626, 1.062, -0.000)), + ("OG", 4, (0.503, 1.325, 0.000)), + ], + "THR": [ + ("N", 0, (-0.517, 1.364, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.526, 0.000, -0.000)), + ("CB", 0, (-0.516, -0.793, -1.215)), + ("O", 3, (0.626, 1.062, 0.000)), + ("CG2", 4, (0.550, -0.718, -1.228)), + ("OG1", 4, (0.472, 1.353, 0.000)), + ], + "TRP": [ + ("N", 0, (-0.521, 1.363, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.525, -0.000, 0.000)), + ("CB", 0, (-0.523, -0.776, -1.212)), + ("O", 3, (0.627, 1.062, 0.000)), + ("CG", 4, (0.609, 1.370, -0.000)), + ("CD1", 5, (0.824, 1.091, 0.000)), + ("CD2", 5, (0.854, -1.148, -0.005)), + ("CE2", 5, (2.186, -0.678, -0.007)), + ("CE3", 5, (0.622, -2.530, -0.007)), + ("NE1", 5, (2.140, 0.690, -0.004)), + ("CH2", 5, (3.028, -2.890, -0.013)), + ("CZ2", 5, (3.283, -1.543, -0.011)), + ("CZ3", 5, (1.715, -3.389, -0.011)), + ], + "TYR": [ + ("N", 0, (-0.522, 1.362, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.524, -0.000, -0.000)), + ("CB", 0, (-0.522, -0.776, -1.213)), + ("O", 3, (0.627, 1.062, -0.000)), + ("CG", 4, (0.607, 1.382, -0.000)), + ("CD1", 5, (0.716, 1.195, -0.000)), + ("CD2", 5, (0.713, -1.194, -0.001)), + ("CE1", 5, (2.107, 1.200, -0.002)), + ("CE2", 5, (2.104, -1.201, -0.003)), + ("OH", 5, (4.168, -0.002, -0.005)), + ("CZ", 5, (2.791, -0.001, -0.003)), + ], + "VAL": [ + ("N", 0, (-0.494, 1.373, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.527, -0.000, -0.000)), + ("CB", 0, (-0.533, -0.795, -1.213)), + ("O", 3, (0.627, 1.062, -0.000)), + ("CG1", 4, (0.540, 1.429, -0.000)), + ("CG2", 4, (0.533, -0.776, 1.203)), + ], +} + +# A list of atoms (excluding hydrogen) for each AA type. PDB naming convention. +residue_atoms: dict[str, list[str]] = { + "ALA": ["C", "CA", "CB", "N", "O"], + "ARG": ["C", "CA", "CB", "CG", "CD", "CZ", "N", "NE", "O", "NH1", "NH2"], + "ASP": ["C", "CA", "CB", "CG", "N", "O", "OD1", "OD2"], + "ASN": ["C", "CA", "CB", "CG", "N", "ND2", "O", "OD1"], + "CYS": ["C", "CA", "CB", "N", "O", "SG"], + "GLU": ["C", "CA", "CB", "CG", "CD", "N", "O", "OE1", "OE2"], + "GLN": ["C", "CA", "CB", "CG", "CD", "N", "NE2", "O", "OE1"], + "GLY": ["C", "CA", "N", "O"], + "HIS": ["C", "CA", "CB", "CG", "CD2", "CE1", "N", "ND1", "NE2", "O"], + "ILE": ["C", "CA", "CB", "CG1", "CG2", "CD1", "N", "O"], + "LEU": ["C", "CA", "CB", "CG", "CD1", "CD2", "N", "O"], + "LYS": ["C", "CA", "CB", "CG", "CD", "CE", "N", "NZ", "O"], + "MET": ["C", "CA", "CB", "CG", "CE", "N", "O", "SD"], + "PHE": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O"], + "PRO": ["C", "CA", "CB", "CG", "CD", "N", "O"], + "SER": ["C", "CA", "CB", "N", "O", "OG"], + "THR": ["C", "CA", "CB", "CG2", "N", "O", "OG1"], + "TRP": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE2", "CE3", "CZ2", "CZ3", "CH2", "N", "NE1", "O"], + "TYR": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O", "OH"], + "VAL": ["C", "CA", "CB", "CG1", "CG2", "N", "O"], +} + +# Naming swaps for ambiguous atom names. +# Due to symmetries in the amino acids the naming of atoms is ambiguous in +# 4 of the 20 amino acids. +# (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities +# in LEU, VAL and ARG can be resolved by using the 3d constellations of +# the 'ambiguous' atoms and their neighbours) +# TODO: ^ interpret this +residue_atom_renaming_swaps: dict[str, dict[str, str]] = { + "ASP": {"OD1": "OD2"}, + "GLU": {"OE1": "OE2"}, + "PHE": {"CD1": "CD2", "CE1": "CE2"}, + "TYR": {"CD1": "CD2", "CE1": "CE2"}, +} + +# Van der Waals radii [Angstroem] of the atoms (from Wikipedia) +van_der_waals_radius: dict[str, float] = { + "C": 1.7, + "N": 1.55, + "O": 1.52, + "S": 1.8, +} + +Bond = collections.namedtuple("Bond", ["atom1_name", "atom2_name", "length", "stddev"]) +BondAngle = collections.namedtuple( + "BondAngle", + ["atom1_name", "atom2_name", "atom3name", "angle_rad", "stddev"], +) + + +def map_structure_with_atom_order(in_list: list, first_call: bool = True) -> list: + # Maps strings in a nested list structure to their corresponding index in atom_order + if first_call: + in_list = copy.deepcopy(in_list) + for i in range(len(in_list)): + if isinstance(in_list[i], list): + in_list[i] = map_structure_with_atom_order(in_list[i], first_call=False) + elif isinstance(in_list[i], str): + in_list[i] = atom_order[in_list[i]] + else: + raise TypeError("Unexpected type when mapping nested lists!") + return in_list + + +@functools.cache +def load_stereo_chemical_props() -> tuple[ + Mapping[str, list[Bond]], + Mapping[str, list[Bond]], + Mapping[str, list[BondAngle]], +]: + """Load stereo_chemical_props.txt into a nice structure. + + Load literature values for bond lengths and bond angles and translate bond angles into the length of the opposite + edge of the triangle ("residue_virtual_bonds"). + + Returns: + residue_bonds: dict that maps resname --> list of Bond tuples residue_virtual_bonds: dict that maps resname --> + list of Bond tuples residue_bond_angles: dict that maps resname --> list of BondAngle tuples + """ + # TODO: this file should be downloaded in a setup script + stereo_chemical_props = resources.read_text("openfold.resources", "stereo_chemical_props.txt") + + lines_iter = iter(stereo_chemical_props.splitlines()) + # Load bond lengths. + residue_bonds: dict[str, list[Bond]] = {} + next(lines_iter) # Skip header line. + for line in lines_iter: + if line.strip() == "-": + break + bond, resname, bond_length, stddev = line.split() + atom1, atom2 = bond.split("-") + if resname not in residue_bonds: + residue_bonds[resname] = [] + residue_bonds[resname].append(Bond(atom1, atom2, float(bond_length), float(stddev))) + residue_bonds["UNK"] = [] + + # Load bond angles. + residue_bond_angles: dict[str, list[BondAngle]] = {} + next(lines_iter) # Skip empty line. + next(lines_iter) # Skip header line. + for line in lines_iter: + if line.strip() == "-": + break + bond, resname, angle_degree, stddev_degree = line.split() + atom1, atom2, atom3 = bond.split("-") + if resname not in residue_bond_angles: + residue_bond_angles[resname] = [] + residue_bond_angles[resname].append( + BondAngle( + atom1, + atom2, + atom3, + float(angle_degree) / 180.0 * np.pi, + float(stddev_degree) / 180.0 * np.pi, + ) + ) + residue_bond_angles["UNK"] = [] + + def make_bond_key(atom1_name: str, atom2_name: str) -> str: + """Unique key to lookup bonds.""" + return "-".join(sorted([atom1_name, atom2_name])) + + # Translate bond angles into distances ("virtual bonds"). + residue_virtual_bonds: dict[str, list[Bond]] = {} + for resname, bond_angles in residue_bond_angles.items(): + # Create a fast lookup dict for bond lengths. + bond_cache: dict[str, Bond] = {} + for b in residue_bonds[resname]: + bond_cache[make_bond_key(b.atom1_name, b.atom2_name)] = b + residue_virtual_bonds[resname] = [] + for ba in bond_angles: + bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)] + bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)] + + # Compute distance between atom1 and atom3 using the law of cosines + # c^2 = a^2 + b^2 - 2ab*cos(gamma). + gamma = ba.angle_rad + length = np.sqrt(bond1.length**2 + bond2.length**2 - 2 * bond1.length * bond2.length * np.cos(gamma)) + + # Propagation of uncertainty assuming uncorrelated errors. + dl_outer = 0.5 / length + dl_dgamma = (2 * bond1.length * bond2.length * np.sin(gamma)) * dl_outer + dl_db1 = (2 * bond1.length - 2 * bond2.length * np.cos(gamma)) * dl_outer + dl_db2 = (2 * bond2.length - 2 * bond1.length * np.cos(gamma)) * dl_outer + stddev = np.sqrt( + (dl_dgamma * ba.stddev) ** 2 + (dl_db1 * bond1.stddev) ** 2 + (dl_db2 * bond2.stddev) ** 2 + ) + residue_virtual_bonds[resname].append(Bond(ba.atom1_name, ba.atom3name, length, stddev)) + + return (residue_bonds, residue_virtual_bonds, residue_bond_angles) + + +# Between-residue bond lengths for general bonds (first element) and for Proline +# (second element). +between_res_bond_length_c_n: tuple[float, float] = (1.329, 1.341) +between_res_bond_length_stddev_c_n: tuple[float, float] = (0.014, 0.016) + +# Between-residue cos_angles. +between_res_cos_angles_c_n_ca: tuple[float, float] = (-0.5203, 0.0353) # degrees: 121.352 +- 2.315 +between_res_cos_angles_ca_c_n: tuple[float, float] = (-0.4473, 0.0311) # degrees: 116.568 +- 1.995 + +# This mapping is used when we need to store atom data in a format that requires +# fixed atom data size for every residue (e.g. a numpy array). +atom_types: list[str] = [ + "N", + "CA", + "C", + "CB", + "O", + "CG", + "CG1", + "CG2", + "OG", + "OG1", + "SG", + "CD", + "CD1", + "CD2", + "ND1", + "ND2", + "OD1", + "OD2", + "SD", + "CE", + "CE1", + "CE2", + "CE3", + "NE", + "NE1", + "NE2", + "OE1", + "OE2", + "CH2", + "NH1", + "NH2", + "OH", + "CZ", + "CZ2", + "CZ3", + "NZ", + "OXT", +] +atom_order: dict[str, int] = {atom_type: i for i, atom_type in enumerate(atom_types)} +atom_type_num = len(atom_types) # := 37. + +# A compact atom encoding with 14 columns +# pylint: disable=line-too-long +restype_name_to_atom14_names: dict[str, list[str]] = { + "ALA": ["N", "CA", "C", "O", "CB", "", "", "", "", "", "", "", "", ""], + "ARG": ["N", "CA", "C", "O", "CB", "CG", "CD", "NE", "CZ", "NH1", "NH2", "", "", ""], + "ASN": ["N", "CA", "C", "O", "CB", "CG", "OD1", "ND2", "", "", "", "", "", ""], + "ASP": ["N", "CA", "C", "O", "CB", "CG", "OD1", "OD2", "", "", "", "", "", ""], + "CYS": ["N", "CA", "C", "O", "CB", "SG", "", "", "", "", "", "", "", ""], + "GLN": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "NE2", "", "", "", "", ""], + "GLU": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "OE2", "", "", "", "", ""], + "GLY": ["N", "CA", "C", "O", "", "", "", "", "", "", "", "", "", ""], + "HIS": ["N", "CA", "C", "O", "CB", "CG", "ND1", "CD2", "CE1", "NE2", "", "", "", ""], + "ILE": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "CD1", "", "", "", "", "", ""], + "LEU": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "", "", "", "", "", ""], + "LYS": ["N", "CA", "C", "O", "CB", "CG", "CD", "CE", "NZ", "", "", "", "", ""], + "MET": ["N", "CA", "C", "O", "CB", "CG", "SD", "CE", "", "", "", "", "", ""], + "PHE": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "", "", ""], + "PRO": ["N", "CA", "C", "O", "CB", "CG", "CD", "", "", "", "", "", "", ""], + "SER": ["N", "CA", "C", "O", "CB", "OG", "", "", "", "", "", "", "", ""], + "THR": ["N", "CA", "C", "O", "CB", "OG1", "CG2", "", "", "", "", "", "", ""], + "TRP": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "NE1", "CE2", "CE3", "CZ2", "CZ3", "CH2"], + "TYR": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "OH", "", ""], + "VAL": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "", "", "", "", "", "", ""], + "UNK": ["", "", "", "", "", "", "", "", "", "", "", "", "", ""], +} +# pylint: enable=line-too-long + + +# This is the standard residue order when coding AA type as a number. +# Reproduce it by taking 3-letter AA codes and sorting them alphabetically. +restypes: list[str] = [ + "A", + "R", + "N", + "D", + "C", + "Q", + "E", + "G", + "H", + "I", + "L", + "K", + "M", + "F", + "P", + "S", + "T", + "W", + "Y", + "V", +] +restype_order: dict[str, int] = {restype: i for i, restype in enumerate(restypes)} +restype_num = len(restypes) # := 20. +unk_restype_index = restype_num # Catch-all index for unknown restypes. + +restypes_with_x: list[str] = restypes + ["X"] +restype_order_with_x: dict[str, int] = {restype: i for i, restype in enumerate(restypes_with_x)} + + +def sequence_to_onehot(sequence: str, mapping: Mapping[str, int], map_unknown_to_x: bool = False) -> np.ndarray: + """Maps the given sequence into a one-hot encoded matrix. + + Args: + sequence: An amino acid sequence. + mapping: A dictionary mapping amino acids to integers. + map_unknown_to_x: If True, any amino acid that is not in the mapping will be + mapped to the unknown amino acid 'X'. If the mapping doesn't contain amino acid 'X', an error will be thrown. + If False, any amino acid not in the mapping will throw an error. + + Returns: + A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of the sequence. + + Raises: + ValueError: If the mapping doesn't contain values from 0 to + num_unique_aas - 1 without any gaps. + """ + num_entries = max(mapping.values()) + 1 + + if sorted(set(mapping.values())) != list(range(num_entries)): + raise ValueError( + "The mapping must have values from 0 to num_unique_aas-1 without any gaps. Got: %s" + % sorted(mapping.values()) + ) + + one_hot_arr = np.zeros((len(sequence), num_entries), dtype=np.int32) + + for aa_index, aa_type in enumerate(sequence): + if map_unknown_to_x: + if aa_type.isalpha() and aa_type.isupper(): + aa_id = mapping.get(aa_type, mapping["X"]) + else: + raise ValueError(f"Invalid character in the sequence: {aa_type}") + else: + aa_id = mapping[aa_type] + one_hot_arr[aa_index, aa_id] = 1 + + return one_hot_arr + + +restype_1to3: dict[str, str] = { + "A": "ALA", + "R": "ARG", + "N": "ASN", + "D": "ASP", + "C": "CYS", + "Q": "GLN", + "E": "GLU", + "G": "GLY", + "H": "HIS", + "I": "ILE", + "L": "LEU", + "K": "LYS", + "M": "MET", + "F": "PHE", + "P": "PRO", + "S": "SER", + "T": "THR", + "W": "TRP", + "Y": "TYR", + "V": "VAL", +} + + +# NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple +# 1-to-1 mapping of 3 letter names to one letter names. The latter contains +# many more, and less common, three letter names as keys and maps many of these +# to the same one letter name (including 'X' and 'U' which we don't use here). +restype_3to1: dict[str, str] = {v: k for k, v in restype_1to3.items()} + +# Define a restype name for all unknown residues. +unk_restype = "UNK" + +resnames: list[str] = [restype_1to3[r] for r in restypes] + [unk_restype] +resname_to_idx: dict[str, int] = {resname: i for i, resname in enumerate(resnames)} + + +# The mapping here uses hhblits convention, so that B is mapped to D, J and O +# are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the +# remaining 20 amino acids are kept in alphabetical order. +# There are 2 non-amino acid codes, X (representing any amino acid) and +# "-" representing a missing amino acid in an alignment. The id for these +# codes is put at the end (20 and 21) so that they can easily be ignored if +# desired. +HHBLITS_AA_TO_ID: dict[str, int] = { + "A": 0, + "B": 2, + "C": 1, + "D": 2, + "E": 3, + "F": 4, + "G": 5, + "H": 6, + "I": 7, + "J": 20, + "K": 8, + "L": 9, + "M": 10, + "N": 11, + "O": 20, + "P": 12, + "Q": 13, + "R": 14, + "S": 15, + "T": 16, + "U": 1, + "V": 17, + "W": 18, + "X": 20, + "Y": 19, + "Z": 3, + "-": 21, +} + +# Partial inversion of HHBLITS_AA_TO_ID. +ID_TO_HHBLITS_AA: dict[int, str] = { + 0: "A", + 1: "C", # Also U. + 2: "D", # Also B. + 3: "E", # Also Z. + 4: "F", + 5: "G", + 6: "H", + 7: "I", + 8: "K", + 9: "L", + 10: "M", + 11: "N", + 12: "P", + 13: "Q", + 14: "R", + 15: "S", + 16: "T", + 17: "V", + 18: "W", + 19: "Y", + 20: "X", # Includes J and O. + 21: "-", +} + +restypes_with_x_and_gap: list[str] = restypes + ["X", "-"] +MAP_HHBLITS_AATYPE_TO_OUR_AATYPE: tuple[int, ...] = tuple( + restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i]) for i in range(len(restypes_with_x_and_gap)) +) + + +def _make_standard_atom_mask() -> np.ndarray: + """Returns [num_res_types, num_atom_types] mask array.""" + # +1 to account for unknown (all 0s). + mask = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32) + for restype, restype_letter in enumerate(restypes): + restype_name = restype_1to3[restype_letter] + atom_names = residue_atoms[restype_name] + for atom_name in atom_names: + atom_type = atom_order[atom_name] + mask[restype, atom_type] = 1 + return mask + + +STANDARD_ATOM_MASK = _make_standard_atom_mask() + + +# A one hot representation for the first and second atoms defining the axis +# of rotation for each chi-angle in each residue. +def chi_angle_atom(atom_index: int) -> np.ndarray: + """Define chi-angle rigid groups via one-hot representations.""" + chi_angles_index = {} + one_hots = [] + + for k, v in chi_angles_atoms.items(): + indices = [atom_types.index(s[atom_index]) for s in v] + indices.extend([-1] * (4 - len(indices))) + chi_angles_index[k] = indices + + for r in restypes: + res3 = restype_1to3[r] + one_hot = np.eye(atom_type_num)[chi_angles_index[res3]] + one_hots.append(one_hot) + + one_hots.append(np.zeros([4, atom_type_num])) # Add zeros for residue `X`. + one_hot = np.stack(one_hots, axis=0) + one_hot = np.transpose(one_hot, [0, 2, 1]) + + return one_hot + + +chi_atom_1_one_hot = chi_angle_atom(1) +chi_atom_2_one_hot = chi_angle_atom(2) + +# An array like chi_angles_atoms but using indices rather than names. +chi_angles_atom_indices_list: list[list[list[str]]] = [chi_angles_atoms[restype_1to3[r]] for r in restypes] +chi_angles_atom_indices_ours: list = map_structure_with_atom_order(chi_angles_atom_indices_list) +chi_angles_atom_indices = np.array( + [chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms))) for chi_atoms in chi_angles_atom_indices_list] +) + +# Mapping from (res_name, atom_name) pairs to the atom's chi group index +# and atom index within that group. +chi_groups_for_atom: dict[tuple[str, str], list[tuple[int, int]]] = collections.defaultdict(list) +for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items(): + for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res): + for atom_i, atom in enumerate(chi_group): + chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i)) +chi_groups_for_atom = dict(chi_groups_for_atom) + + +def _make_rigid_transformation_4x4(ex: np.ndarray, ey: np.ndarray, translation: np.ndarray) -> np.ndarray: + """Create a rigid 4x4 transformation matrix from two axes and transl.""" + # Normalize ex. + ex_normalized = ex / np.linalg.norm(ex) + + # make ey perpendicular to ex + ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized + ey_normalized /= np.linalg.norm(ey_normalized) + + # compute ez as cross product + eznorm = np.cross(ex_normalized, ey_normalized) + m = np.stack([ex_normalized, ey_normalized, eznorm, translation]).transpose() + m = np.concatenate([m, [[0.0, 0.0, 0.0, 1.0]]], axis=0) + return m + + +# create an array with (restype, atomtype) --> rigid_group_idx +# and an array with (restype, atomtype, coord) for the atom positions +# and compute affine transformation matrices (4,4) from one rigid group to the +# previous group +restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=int) +restype_atom37_mask = np.zeros([21, 37], dtype=np.float32) +restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32) +restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=int) +restype_atom14_mask = np.zeros([21, 14], dtype=np.float32) +restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32) +restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32) + + +def _make_rigid_group_constants() -> None: + """Fill the arrays above.""" + for restype, restype_letter in enumerate(restypes): + resname = restype_1to3[restype_letter] + for atomname, group_idx, atom_position in rigid_group_atom_positions[resname]: + atomtype = atom_order[atomname] + restype_atom37_to_rigid_group[restype, atomtype] = group_idx + restype_atom37_mask[restype, atomtype] = 1 + restype_atom37_rigid_group_positions[restype, atomtype, :] = atom_position + + atom14idx = restype_name_to_atom14_names[resname].index(atomname) + restype_atom14_to_rigid_group[restype, atom14idx] = group_idx + restype_atom14_mask[restype, atom14idx] = 1 + restype_atom14_rigid_group_positions[restype, atom14idx, :] = atom_position + + for restype, restype_letter in enumerate(restypes): + resname = restype_1to3[restype_letter] + atom_positions: dict[str, np.ndarray] = { + name: np.array(pos) for name, _, pos in rigid_group_atom_positions[resname] + } + + # backbone to backbone is the identity transform + restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4) + + # pre-omega-frame to backbone (currently dummy identity matrix) + restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4) + + # phi-frame to backbone + mat = _make_rigid_transformation_4x4( + ex=atom_positions["N"] - atom_positions["CA"], + ey=np.array([1.0, 0.0, 0.0]), + translation=atom_positions["N"], + ) + restype_rigid_group_default_frame[restype, 2, :, :] = mat + + # psi-frame to backbone + mat = _make_rigid_transformation_4x4( + ex=atom_positions["C"] - atom_positions["CA"], + ey=atom_positions["CA"] - atom_positions["N"], + translation=atom_positions["C"], + ) + restype_rigid_group_default_frame[restype, 3, :, :] = mat + + # chi1-frame to backbone + if chi_angles_mask[restype][0]: + base_atom_names = chi_angles_atoms[resname][0] + base_atom_positions = [atom_positions[name] for name in base_atom_names] + mat = _make_rigid_transformation_4x4( + ex=base_atom_positions[2] - base_atom_positions[1], + ey=base_atom_positions[0] - base_atom_positions[1], + translation=base_atom_positions[2], + ) + restype_rigid_group_default_frame[restype, 4, :, :] = mat + + # chi2-frame to chi1-frame + # chi3-frame to chi2-frame + # chi4-frame to chi3-frame + # luckily all rotation axes for the next frame start at (0,0,0) of the + # previous frame + for chi_idx in range(1, 4): + if chi_angles_mask[restype][chi_idx]: + axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2] + axis_end_atom_position = atom_positions[axis_end_atom_name] + mat = _make_rigid_transformation_4x4( + ex=axis_end_atom_position, + ey=np.array([-1.0, 0.0, 0.0]), + translation=axis_end_atom_position, + ) + restype_rigid_group_default_frame[restype, 4 + chi_idx, :, :] = mat + + +_make_rigid_group_constants() + + +def make_atom14_dists_bounds( + overlap_tolerance: float = 1.5, + bond_length_tolerance_factor: int = 15, +) -> dict[str, np.ndarray]: + """compute upper and lower bounds for bonds to assess violations.""" + restype_atom14_bond_lower_bound = np.zeros([21, 14, 14], np.float32) + restype_atom14_bond_upper_bound = np.zeros([21, 14, 14], np.float32) + restype_atom14_bond_stddev = np.zeros([21, 14, 14], np.float32) + residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props() + for restype, restype_letter in enumerate(restypes): + resname = restype_1to3[restype_letter] + atom_list = restype_name_to_atom14_names[resname] + + # create lower and upper bounds for clashes + for atom1_idx, atom1_name in enumerate(atom_list): + if not atom1_name: + continue + atom1_radius = van_der_waals_radius[atom1_name[0]] + for atom2_idx, atom2_name in enumerate(atom_list): + if (not atom2_name) or atom1_idx == atom2_idx: + continue + atom2_radius = van_der_waals_radius[atom2_name[0]] + lower = atom1_radius + atom2_radius - overlap_tolerance + upper = 1e10 + restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower + restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower + restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper + restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper + + # overwrite lower and upper bounds for bonds and angles + for b in residue_bonds[resname] + residue_virtual_bonds[resname]: + atom1_idx = atom_list.index(b.atom1_name) + atom2_idx = atom_list.index(b.atom2_name) + lower = b.length - bond_length_tolerance_factor * b.stddev + upper = b.length + bond_length_tolerance_factor * b.stddev + restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower + restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower + restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper + restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper + restype_atom14_bond_stddev[restype, atom1_idx, atom2_idx] = b.stddev + restype_atom14_bond_stddev[restype, atom2_idx, atom1_idx] = b.stddev + return { + "lower_bound": restype_atom14_bond_lower_bound, # shape (21,14,14) + "upper_bound": restype_atom14_bond_upper_bound, # shape (21,14,14) + "stddev": restype_atom14_bond_stddev, # shape (21,14,14) + } + + +restype_atom14_ambiguous_atoms = np.zeros((21, 14), dtype=np.float32) +restype_atom14_ambiguous_atoms_swap_idx: np.ndarray = np.tile(np.arange(14, dtype=int), (21, 1)) + + +def _make_atom14_ambiguity_feats() -> None: + for res, pairs in residue_atom_renaming_swaps.items(): + res_idx = restype_order[restype_3to1[res]] + for atom1, atom2 in pairs.items(): + atom1_idx = restype_name_to_atom14_names[res].index(atom1) + atom2_idx = restype_name_to_atom14_names[res].index(atom2) + restype_atom14_ambiguous_atoms[res_idx, atom1_idx] = 1 + restype_atom14_ambiguous_atoms[res_idx, atom2_idx] = 1 + restype_atom14_ambiguous_atoms_swap_idx[res_idx, atom1_idx] = atom2_idx + restype_atom14_ambiguous_atoms_swap_idx[res_idx, atom2_idx] = atom1_idx + + +_make_atom14_ambiguity_feats() + + +def aatype_to_str_sequence(aatype: Sequence[int]) -> str: + return "".join([restypes_with_x[aatype[i]] for i in range(len(aatype))]) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/rigid_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/rigid_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e2bc25cb6d53f0428be3f6c1c25e67d67edab739 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/rigid_utils.py @@ -0,0 +1,1243 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from collections.abc import Sequence +from functools import cache +from typing import Any, Callable + +import numpy as np +import torch + + +def rot_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """ + Performs matrix multiplication of two rotation matrix tensors. Written out by hand to avoid AMP downcasting. + + Args: + a: [*, 3, 3] left multiplicand + b: [*, 3, 3] right multiplicand + Returns: + The product ab + """ + + def row_mul(i: int) -> torch.Tensor: + return torch.stack( + [ + a[..., i, 0] * b[..., 0, 0] + a[..., i, 1] * b[..., 1, 0] + a[..., i, 2] * b[..., 2, 0], + a[..., i, 0] * b[..., 0, 1] + a[..., i, 1] * b[..., 1, 1] + a[..., i, 2] * b[..., 2, 1], + a[..., i, 0] * b[..., 0, 2] + a[..., i, 1] * b[..., 1, 2] + a[..., i, 2] * b[..., 2, 2], + ], + dim=-1, + ) + + return torch.stack( + [ + row_mul(0), + row_mul(1), + row_mul(2), + ], + dim=-2, + ) + + +def rot_vec_mul(r: torch.Tensor, t: torch.Tensor) -> torch.Tensor: + """ + Applies a rotation to a vector. Written out by hand to avoid transfer to avoid AMP downcasting. + + Args: + r: [*, 3, 3] rotation matrices + t: [*, 3] coordinate tensors + Returns: + [*, 3] rotated coordinates + """ + x, y, z = torch.unbind(t, dim=-1) + return torch.stack( + [ + r[..., 0, 0] * x + r[..., 0, 1] * y + r[..., 0, 2] * z, + r[..., 1, 0] * x + r[..., 1, 1] * y + r[..., 1, 2] * z, + r[..., 2, 0] * x + r[..., 2, 1] * y + r[..., 2, 2] * z, + ], + dim=-1, + ) + + +@cache +def identity_rot_mats( + batch_dims: tuple[int, ...], + dtype: torch.dtype | None = None, + device: torch.device | None = None, + requires_grad: bool = True, +) -> torch.Tensor: + rots = torch.eye(3, dtype=dtype, device=device, requires_grad=requires_grad) + rots = rots.view(*((1,) * len(batch_dims)), 3, 3) + rots = rots.expand(*batch_dims, -1, -1) + rots = rots.contiguous() + + return rots + + +@cache +def identity_trans( + batch_dims: tuple[int, ...], + dtype: torch.dtype | None = None, + device: torch.device | None = None, + requires_grad: bool = True, +) -> torch.Tensor: + trans = torch.zeros((*batch_dims, 3), dtype=dtype, device=device, requires_grad=requires_grad) + return trans + + +@cache +def identity_quats( + batch_dims: tuple[int, ...], + dtype: torch.dtype | None = None, + device: torch.device | None = None, + requires_grad: bool = True, +) -> torch.Tensor: + quat = torch.zeros((*batch_dims, 4), dtype=dtype, device=device, requires_grad=requires_grad) + + with torch.no_grad(): + quat[..., 0] = 1 + + return quat + + +_quat_elements: list[str] = ["a", "b", "c", "d"] +_qtr_keys: list[str] = [l1 + l2 for l1 in _quat_elements for l2 in _quat_elements] +_qtr_ind_dict: dict[str, int] = {key: ind for ind, key in enumerate(_qtr_keys)} + + +def _to_mat(pairs: list[tuple[str, int]]) -> np.ndarray: + mat = np.zeros((4, 4)) + for key, value in pairs: + ind = _qtr_ind_dict[key] + mat[ind // 4][ind % 4] = value + + return mat + + +_QTR_MAT = np.zeros((4, 4, 3, 3)) +_QTR_MAT[..., 0, 0] = _to_mat([("aa", 1), ("bb", 1), ("cc", -1), ("dd", -1)]) +_QTR_MAT[..., 0, 1] = _to_mat([("bc", 2), ("ad", -2)]) +_QTR_MAT[..., 0, 2] = _to_mat([("bd", 2), ("ac", 2)]) +_QTR_MAT[..., 1, 0] = _to_mat([("bc", 2), ("ad", 2)]) +_QTR_MAT[..., 1, 1] = _to_mat([("aa", 1), ("bb", -1), ("cc", 1), ("dd", -1)]) +_QTR_MAT[..., 1, 2] = _to_mat([("cd", 2), ("ab", -2)]) +_QTR_MAT[..., 2, 0] = _to_mat([("bd", 2), ("ac", -2)]) +_QTR_MAT[..., 2, 1] = _to_mat([("cd", 2), ("ab", 2)]) +_QTR_MAT[..., 2, 2] = _to_mat([("aa", 1), ("bb", -1), ("cc", -1), ("dd", 1)]) + + +def quat_to_rot(quat: torch.Tensor) -> torch.Tensor: + """ + Converts a quaternion to a rotation matrix. + + Args: + quat: [*, 4] quaternions + Returns: + [*, 3, 3] rotation matrices + """ + # [*, 4, 4] + quat = quat[..., None] * quat[..., None, :] + + # [4, 4, 3, 3] + mat = _get_quat("_QTR_MAT", dtype=quat.dtype, device=quat.device) + + # [*, 4, 4, 3, 3] + shaped_qtr_mat = mat.view((1,) * len(quat.shape[:-2]) + mat.shape) + quat = quat[..., None, None] * shaped_qtr_mat + + # [*, 3, 3] + return torch.sum(quat, dim=(-3, -4)) + + +def rot_to_quat(rot: torch.Tensor) -> torch.Tensor: + if rot.shape[-2:] != (3, 3): + raise ValueError("Input rotation is incorrectly shaped") + + [[xx, xy, xz], [yx, yy, yz], [zx, zy, zz]] = [[rot[..., i, j] for j in range(3)] for i in range(3)] + + k = [ + [ + xx + yy + zz, + zy - yz, + xz - zx, + yx - xy, + ], + [ + zy - yz, + xx - yy - zz, + xy + yx, + xz + zx, + ], + [ + xz - zx, + xy + yx, + yy - xx - zz, + yz + zy, + ], + [ + yx - xy, + xz + zx, + yz + zy, + zz - xx - yy, + ], + ] + + _, vectors = torch.linalg.eigh((1.0 / 3.0) * torch.stack([torch.stack(t, dim=-1) for t in k], dim=-2)) + return vectors[..., -1] + + +_QUAT_MULTIPLY = np.zeros((4, 4, 4)) +_QUAT_MULTIPLY[:, :, 0] = [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, -1]] + +_QUAT_MULTIPLY[:, :, 1] = [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, -1, 0]] + +_QUAT_MULTIPLY[:, :, 2] = [[0, 0, 1, 0], [0, 0, 0, -1], [1, 0, 0, 0], [0, 1, 0, 0]] + +_QUAT_MULTIPLY[:, :, 3] = [[0, 0, 0, 1], [0, 0, 1, 0], [0, -1, 0, 0], [1, 0, 0, 0]] + +_QUAT_MULTIPLY_BY_VEC = _QUAT_MULTIPLY[:, 1:, :] + +_CACHED_QUATS: dict[str, np.ndarray] = { + "_QTR_MAT": _QTR_MAT, + "_QUAT_MULTIPLY": _QUAT_MULTIPLY, + "_QUAT_MULTIPLY_BY_VEC": _QUAT_MULTIPLY_BY_VEC, +} + + +@cache +def _get_quat(quat_key: str, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + return torch.tensor(_CACHED_QUATS[quat_key], dtype=dtype, device=device) + + +def quat_multiply(quat1: torch.Tensor, quat2: torch.Tensor) -> torch.Tensor: + """Multiply a quaternion by another quaternion.""" + mat = _get_quat("_QUAT_MULTIPLY", dtype=quat1.dtype, device=quat1.device) + reshaped_mat = mat.view((1,) * len(quat1.shape[:-1]) + mat.shape) + return torch.sum(reshaped_mat * quat1[..., :, None, None] * quat2[..., None, :, None], dim=(-3, -2)) + + +def quat_multiply_by_vec(quat: torch.Tensor, vec: torch.Tensor) -> torch.Tensor: + """Multiply a quaternion by a pure-vector quaternion.""" + mat = _get_quat("_QUAT_MULTIPLY_BY_VEC", dtype=quat.dtype, device=quat.device) + reshaped_mat = mat.view((1,) * len(quat.shape[:-1]) + mat.shape) + return torch.sum(reshaped_mat * quat[..., :, None, None] * vec[..., None, :, None], dim=(-3, -2)) + + +def invert_rot_mat(rot_mat: torch.Tensor) -> torch.Tensor: + return rot_mat.transpose(-1, -2) + + +def invert_quat(quat: torch.Tensor) -> torch.Tensor: + quat_prime = quat.clone() + quat_prime[..., 1:] *= -1 + inv = quat_prime / torch.sum(quat**2, dim=-1, keepdim=True) + return inv + + +class Rotation: + """ + A 3D rotation. Depending on how the object is initialized, the rotation is represented by either a rotation matrix + or a quaternion, though both formats are made available by helper functions. To simplify gradient computation, the + underlying format of the rotation cannot be changed in-place. Like Rigid, the class is designed to mimic the + behavior of a torch Tensor, almost as if each Rotation object were a tensor of rotations, in one format or another. + """ + + def __init__( + self, + rot_mats: torch.Tensor | None = None, + quats: torch.Tensor | None = None, + normalize_quats: bool = True, + ): + """ + Args: + rot_mats: + A [*, 3, 3] rotation matrix tensor. Mutually exclusive with quats + quats: + A [*, 4] quaternion. Mutually exclusive with rot_mats. If normalize_quats is not True, must be a unit + quaternion + normalize_quats: + If quats is specified, whether to normalize quats + """ + if (rot_mats is None and quats is None) or (rot_mats is not None and quats is not None): + raise ValueError("Exactly one input argument must be specified") + + if (rot_mats is not None and rot_mats.shape[-2:] != (3, 3)) or (quats is not None and quats.shape[-1] != 4): + raise ValueError("Incorrectly shaped rotation matrix or quaternion") + + # Force full-precision + if quats is not None: + quats = quats.to(dtype=torch.float32) + if rot_mats is not None: + rot_mats = rot_mats.to(dtype=torch.float32) + + if quats is not None and normalize_quats: + quats = quats / torch.linalg.norm(quats, dim=-1, keepdim=True) + + self._rot_mats = rot_mats + self._quats = quats + + @staticmethod + def identity( + shape, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + requires_grad: bool = True, + fmt: str = "quat", + ) -> Rotation: + """ + Returns an identity Rotation. + + Args: + shape: + The "shape" of the resulting Rotation object. See documentation for the shape property + dtype: + The torch dtype for the rotation + device: + The torch device for the new rotation + requires_grad: + Whether the underlying tensors in the new rotation object should require gradient computation + fmt: + One of "quat" or "rot_mat". Determines the underlying format of the new object's rotation + Returns: + A new identity rotation + """ + if fmt == "rot_mat": + rot_mats = identity_rot_mats( + shape, + dtype, + device, + requires_grad, + ) + return Rotation(rot_mats=rot_mats, quats=None) + elif fmt == "quat": + quats = identity_quats(shape, dtype, device, requires_grad) + return Rotation(rot_mats=None, quats=quats, normalize_quats=False) + else: + raise ValueError(f"Invalid format: f{fmt}") + + # Magic methods + + def __getitem__(self, index: Any) -> Rotation: + """ + Allows torch-style indexing over the virtual shape of the rotation object. See documentation for the shape + property. + + Args: + index: + A torch index. E.g. (1, 3, 2), or (slice(None,)) + Returns: + The indexed rotation + """ + if type(index) is not tuple: + index = (index,) + + if self._rot_mats is not None: + rot_mats = self._rot_mats[index + (slice(None), slice(None))] + return Rotation(rot_mats=rot_mats) + elif self._quats is not None: + quats = self._quats[index + (slice(None),)] + return Rotation(quats=quats, normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + def __mul__(self, right: torch.Tensor) -> Rotation: + """ + Pointwise left multiplication of the rotation with a tensor. Can be used to e.g. mask the Rotation. + + Args: + right: + The tensor multiplicand + Returns: + The product + """ + if not (isinstance(right, torch.Tensor)): + raise TypeError("The other multiplicand must be a Tensor") + + if self._rot_mats is not None: + rot_mats = self._rot_mats * right[..., None, None] + return Rotation(rot_mats=rot_mats, quats=None) + elif self._quats is not None: + quats = self._quats * right[..., None] + return Rotation(rot_mats=None, quats=quats, normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + def __rmul__(self, left: torch.Tensor) -> Rotation: + """ + Reverse pointwise multiplication of the rotation with a tensor. + + Args: + left: + The left multiplicand + Returns: + The product + """ + return self.__mul__(left) + + # Properties + + @property + def shape(self) -> torch.Size: + """ + Returns the virtual shape of the rotation object. This shape is defined as the batch dimensions of the + underlying rotation matrix or quaternion. If the Rotation was initialized with a [10, 3, 3] rotation matrix + tensor, for example, the resulting shape would be [10]. + + Returns: + The virtual shape of the rotation object + """ + if self._rot_mats is not None: + return self._rot_mats.shape[:-2] + elif self._quats is not None: + return self._quats.shape[:-1] + else: + raise ValueError("Both rotations are None") + + @property + def dtype(self) -> torch.dtype: + """ + Returns the dtype of the underlying rotation. + + Returns: + The dtype of the underlying rotation + """ + if self._rot_mats is not None: + return self._rot_mats.dtype + elif self._quats is not None: + return self._quats.dtype + else: + raise ValueError("Both rotations are None") + + @property + def device(self) -> torch.device: + """ + The device of the underlying rotation + + Returns: + The device of the underlying rotation + """ + if self._rot_mats is not None: + return self._rot_mats.device + elif self._quats is not None: + return self._quats.device + else: + raise ValueError("Both rotations are None") + + @property + def requires_grad(self) -> bool: + """ + Returns the requires_grad property of the underlying rotation + + Returns: + The requires_grad property of the underlying tensor + """ + if self._rot_mats is not None: + return self._rot_mats.requires_grad + elif self._quats is not None: + return self._quats.requires_grad + else: + raise ValueError("Both rotations are None") + + def get_rot_mats(self) -> torch.Tensor: + """ + Returns the underlying rotation as a rotation matrix tensor. + + Returns: + The rotation as a rotation matrix tensor + """ + if self._rot_mats is not None: + return self._rot_mats + elif self._quats is not None: + return quat_to_rot(self._quats) + else: + raise ValueError("Both rotations are None") + + def get_quats(self) -> torch.Tensor: + """ + Returns the underlying rotation as a quaternion tensor. + + Depending on whether the Rotation was initialized with a quaternion, this function may call torch.linalg.eigh. + + Returns: + The rotation as a quaternion tensor. + """ + if self._rot_mats is not None: + return rot_to_quat(self._rot_mats) + elif self._quats is not None: + return self._quats + else: + raise ValueError("Both rotations are None") + + def get_cur_rot(self) -> torch.Tensor: + """ + Return the underlying rotation in its current form + + Returns: + The stored rotation + """ + if self._rot_mats is not None: + return self._rot_mats + elif self._quats is not None: + return self._quats + else: + raise ValueError("Both rotations are None") + + # Rotation functions + + def compose_q_update_vec(self, q_update_vec: torch.Tensor, normalize_quats: bool = True) -> Rotation: + """ + Returns a new quaternion Rotation after updating the current object's underlying rotation with a quaternion + update, formatted as a [*, 3] tensor whose final three columns represent x, y, z such that (1, x, y, z) is the + desired (not necessarily unit) quaternion update. + + Args: + q_update_vec: + A [*, 3] quaternion update tensor + normalize_quats: + Whether to normalize the output quaternion + Returns: + An updated Rotation + """ + quats = self.get_quats() + new_quats = quats + quat_multiply_by_vec(quats, q_update_vec) + return Rotation( + rot_mats=None, + quats=new_quats, + normalize_quats=normalize_quats, + ) + + def compose_r(self, r: Rotation) -> Rotation: + """ + Compose the rotation matrices of the current Rotation object with those of another. + + Args: + r: + An update rotation object + Returns: + An updated rotation object + """ + r1 = self.get_rot_mats() + r2 = r.get_rot_mats() + new_rot_mats = rot_matmul(r1, r2) + return Rotation(rot_mats=new_rot_mats, quats=None) + + def compose_q(self, r: Rotation, normalize_quats: bool = True) -> Rotation: + """ + Compose the quaternions of the current Rotation object with those of another. + + Depending on whether either Rotation was initialized with quaternions, this function may call + torch.linalg.eigh. + + Args: + r: + An update rotation object + Returns: + An updated rotation object + """ + q1 = self.get_quats() + q2 = r.get_quats() + new_quats = quat_multiply(q1, q2) + return Rotation(rot_mats=None, quats=new_quats, normalize_quats=normalize_quats) + + def apply(self, pts: torch.Tensor) -> torch.Tensor: + """ + Apply the current Rotation as a rotation matrix to a set of 3D coordinates. + + Args: + pts: + A [*, 3] set of points + Returns: + [*, 3] rotated points + """ + rot_mats = self.get_rot_mats() + return rot_vec_mul(rot_mats, pts) + + def invert_apply(self, pts: torch.Tensor) -> torch.Tensor: + """ + The inverse of the apply() method. + + Args: + pts: + A [*, 3] set of points + Returns: + [*, 3] inverse-rotated points + """ + rot_mats = self.get_rot_mats() + inv_rot_mats = invert_rot_mat(rot_mats) + return rot_vec_mul(inv_rot_mats, pts) + + def invert(self) -> Rotation: + """ + Returns the inverse of the current Rotation. + + Returns: + The inverse of the current Rotation + """ + if self._rot_mats is not None: + return Rotation(rot_mats=invert_rot_mat(self._rot_mats), quats=None) + elif self._quats is not None: + return Rotation( + rot_mats=None, + quats=invert_quat(self._quats), + normalize_quats=False, + ) + else: + raise ValueError("Both rotations are None") + + # "Tensor" stuff + + def unsqueeze(self, dim: int) -> Rotation: + """ + Analogous to torch.unsqueeze. The dimension is relative to the shape of the Rotation object. + + Args: + dim: A positive or negative dimension index. + Returns: + The unsqueezed Rotation. + """ + if dim >= len(self.shape): + raise ValueError("Invalid dimension") + + if self._rot_mats is not None: + rot_mats = self._rot_mats.unsqueeze(dim if dim >= 0 else dim - 2) + return Rotation(rot_mats=rot_mats, quats=None) + elif self._quats is not None: + quats = self._quats.unsqueeze(dim if dim >= 0 else dim - 1) + return Rotation(rot_mats=None, quats=quats, normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + @staticmethod + def cat(rs: Sequence[Rotation], dim: int) -> Rotation: + """ + Concatenates rotations along one of the batch dimensions. Analogous to torch.cat(). + + Note that the output of this operation is always a rotation matrix, regardless of the format of input + rotations. + + Args: + rs: + A list of rotation objects + dim: + The dimension along which the rotations should be concatenated + Returns: + A concatenated Rotation object in rotation matrix format + """ + rot_mats = torch.cat( + [r.get_rot_mats() for r in rs], + dim=dim if dim >= 0 else dim - 2, + ) + + return Rotation(rot_mats=rot_mats, quats=None) + + def map_tensor_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rotation: + """ + Apply a Tensor -> Tensor function to underlying rotation tensors, mapping over the rotation dimension(s). Can + be used e.g. to sum out a one-hot batch dimension. + + Args: + fn: + A Tensor -> Tensor function to be mapped over the Rotation + Returns: + The transformed Rotation object + """ + if self._rot_mats is not None: + rot_mats = self._rot_mats.view(self._rot_mats.shape[:-2] + (9,)) + rot_mats = torch.stack(list(map(fn, torch.unbind(rot_mats, dim=-1))), dim=-1) + rot_mats = rot_mats.view(rot_mats.shape[:-1] + (3, 3)) + return Rotation(rot_mats=rot_mats, quats=None) + elif self._quats is not None: + quats = torch.stack(list(map(fn, torch.unbind(self._quats, dim=-1))), dim=-1) + return Rotation(rot_mats=None, quats=quats, normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + def cuda(self) -> Rotation: + """ + Analogous to the cuda() method of torch Tensors + + Returns: + A copy of the Rotation in CUDA memory + """ + if self._rot_mats is not None: + return Rotation(rot_mats=self._rot_mats.cuda(), quats=None) + elif self._quats is not None: + return Rotation(rot_mats=None, quats=self._quats.cuda(), normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + def to(self, device: torch.device | None, dtype: torch.dtype | None) -> Rotation: + """ + Analogous to the to() method of torch Tensors + + Args: + device: + A torch device + dtype: + A torch dtype + Returns: + A copy of the Rotation using the new device and dtype + """ + if self._rot_mats is not None: + return Rotation( + rot_mats=self._rot_mats.to(device=device, dtype=dtype), + quats=None, + ) + elif self._quats is not None: + return Rotation( + rot_mats=None, + quats=self._quats.to(device=device, dtype=dtype), + normalize_quats=False, + ) + else: + raise ValueError("Both rotations are None") + + def detach(self) -> Rotation: + """ + Returns a copy of the Rotation whose underlying Tensor has been detached from its torch graph. + + Returns: + A copy of the Rotation whose underlying Tensor has been detached from its torch graph + """ + if self._rot_mats is not None: + return Rotation(rot_mats=self._rot_mats.detach(), quats=None) + elif self._quats is not None: + return Rotation( + rot_mats=None, + quats=self._quats.detach(), + normalize_quats=False, + ) + else: + raise ValueError("Both rotations are None") + + +class Rigid: + """ + A class representing a rigid transformation. Little more than a wrapper around two objects: a Rotation object and a + [*, 3] translation Designed to behave approximately like a single torch tensor with the shape of the shared batch + dimensions of its component parts. + """ + + def __init__(self, rots: Rotation | None, trans: torch.Tensor | None): + """ + Args: + rots: A [*, 3, 3] rotation tensor + trans: A corresponding [*, 3] translation tensor + """ + # (we need device, dtype, etc. from at least one input) + + batch_dims, dtype, device, requires_grad = None, None, None, None + if trans is not None: + batch_dims = trans.shape[:-1] + dtype = trans.dtype + device = trans.device + requires_grad = trans.requires_grad + elif rots is not None: + batch_dims = rots.shape + dtype = rots.dtype + device = rots.device + requires_grad = rots.requires_grad + else: + raise ValueError("At least one input argument must be specified") + + if rots is None: + rots = Rotation.identity( + batch_dims, + dtype, + device, + requires_grad, + ) + elif trans is None: + trans = identity_trans( + batch_dims, + dtype, + device, + requires_grad, + ) + + assert rots is not None + assert trans is not None + + if (rots.shape != trans.shape[:-1]) or (rots.device != trans.device): + raise ValueError("Rots and trans incompatible") + + # Force full precision. Happens to the rotations automatically. + trans = trans.to(dtype=torch.float32) + + self._rots = rots + self._trans = trans + + @staticmethod + def identity( + shape: tuple[int, ...], + dtype: torch.dtype | None = None, + device: torch.device | None = None, + requires_grad: bool = True, + fmt: str = "quat", + ) -> Rigid: + """ + Constructs an identity transformation. + + Args: + shape: + The desired shape + dtype: + The dtype of both internal tensors + device: + The device of both internal tensors + requires_grad: + Whether grad should be enabled for the internal tensors + Returns: + The identity transformation + """ + return Rigid( + Rotation.identity(shape, dtype, device, requires_grad, fmt=fmt), + identity_trans(shape, dtype, device, requires_grad), + ) + + def __getitem__(self, index: Any) -> Rigid: + """ + Indexes the affine transformation with PyTorch-style indices. The index is applied to the shared dimensions of + both the rotation and the translation. + + E.g.:: + + r = Rotation(rot_mats=torch.rand(10, 10, 3, 3), quats=None) t = Rigid(r, torch.rand(10, 10, 3)) indexed = + t[3, 4:6] assert(indexed.shape == (2,)) assert(indexed.get_rots().shape == (2,)) + assert(indexed.get_trans().shape == (2, 3)) + + Args: + index: A standard torch tensor index. E.g. 8, (10, None, 3), + or (3, slice(0, 1, None)) + Returns: + The indexed tensor + """ + if type(index) is not tuple: + index = (index,) + + return Rigid( + self._rots[index], + self._trans[index + (slice(None),)], + ) + + def __mul__(self, right: torch.Tensor) -> Rigid: + """ + Pointwise left multiplication of the transformation with a tensor. Can be used to e.g. mask the Rigid. + + Args: + right: + The tensor multiplicand + Returns: + The product + """ + if not (isinstance(right, torch.Tensor)): + raise TypeError("The other multiplicand must be a Tensor") + + new_rots = self._rots * right + new_trans = self._trans * right[..., None] + + return Rigid(new_rots, new_trans) + + def __rmul__(self, left: torch.Tensor) -> Rigid: + """ + Reverse pointwise multiplication of the transformation with a tensor. + + Args: + left: + The left multiplicand + Returns: + The product + """ + return self.__mul__(left) + + @property + def shape(self) -> torch.Size: + """ + Returns the shape of the shared dimensions of the rotation and the translation. + + Returns: + The shape of the transformation + """ + return self._trans.shape[:-1] + + @property + def device(self) -> torch.device: + """ + Returns the device on which the Rigid's tensors are located. + + Returns: + The device on which the Rigid's tensors are located + """ + return self._trans.device + + def get_rots(self) -> Rotation: + """ + Getter for the rotation. + + Returns: + The rotation object + """ + return self._rots + + def get_trans(self) -> torch.Tensor: + """ + Getter for the translation. + + Returns: + The stored translation + """ + return self._trans + + def compose_q_update_vec(self, q_update_vec: torch.Tensor) -> Rigid: + """ + Composes the transformation with a quaternion update vector of shape [*, 6], where the final 6 columns + represent the x, y, and z values of a quaternion of form (1, x, y, z) followed by a 3D translation. + + Args: + q_vec: The quaternion update vector. + Returns: + The composed transformation. + """ + q_vec, t_vec = q_update_vec[..., :3], q_update_vec[..., 3:] + new_rots = self._rots.compose_q_update_vec(q_vec) + + trans_update = self._rots.apply(t_vec) + new_translation = self._trans + trans_update + + return Rigid(new_rots, new_translation) + + def compose(self, r: Rigid) -> Rigid: + """ + Composes the current rigid object with another. + + Args: + r: + Another Rigid object + Returns: + The composition of the two transformations + """ + new_rot = self._rots.compose_r(r._rots) + new_trans = self._rots.apply(r._trans) + self._trans + return Rigid(new_rot, new_trans) + + def apply(self, pts: torch.Tensor) -> torch.Tensor: + """ + Applies the transformation to a coordinate tensor. + + Args: + pts: A [*, 3] coordinate tensor. + Returns: + The transformed points. + """ + rotated = self._rots.apply(pts) + return rotated + self._trans + + def invert_apply(self, pts: torch.Tensor) -> torch.Tensor: + """ + Applies the inverse of the transformation to a coordinate tensor. + + Args: + pts: A [*, 3] coordinate tensor + Returns: + The transformed points. + """ + pts = pts - self._trans + return self._rots.invert_apply(pts) + + def invert(self) -> Rigid: + """ + Inverts the transformation. + + Returns: + The inverse transformation. + """ + rot_inv = self._rots.invert() + trn_inv = rot_inv.apply(self._trans) + + return Rigid(rot_inv, -1 * trn_inv) + + def map_tensor_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid: + """ + Apply a Tensor -> Tensor function to underlying translation and rotation tensors, mapping over the + translation/rotation dimensions respectively. + + Args: + fn: + A Tensor -> Tensor function to be mapped over the Rigid + Returns: + The transformed Rigid object + """ + new_rots = self._rots.map_tensor_fn(fn) + new_trans = torch.stack(list(map(fn, torch.unbind(self._trans, dim=-1))), dim=-1) + + return Rigid(new_rots, new_trans) + + def to_tensor_4x4(self) -> torch.Tensor: + """ + Converts a transformation to a homogeneous transformation tensor. + + Returns: + A [*, 4, 4] homogeneous transformation tensor + """ + tensor = self._trans.new_zeros((*self.shape, 4, 4)) + tensor[..., :3, :3] = self._rots.get_rot_mats() + tensor[..., :3, 3] = self._trans + tensor[..., 3, 3] = 1 + return tensor + + @staticmethod + def from_tensor_4x4(t: torch.Tensor) -> Rigid: + """ + Constructs a transformation from a homogeneous transformation tensor. + + Args: + t: [*, 4, 4] homogeneous transformation tensor + Returns: + T object with shape [*] + """ + if t.shape[-2:] != (4, 4): + raise ValueError("Incorrectly shaped input tensor") + + rots = Rotation(rot_mats=t[..., :3, :3], quats=None) + trans = t[..., :3, 3] + + return Rigid(rots, trans) + + def to_tensor_7(self) -> torch.Tensor: + """ + Converts a transformation to a tensor with 7 final columns, four for the quaternion followed by three for the + translation. + + Returns: + A [*, 7] tensor representation of the transformation + """ + tensor = self._trans.new_zeros((*self.shape, 7)) + tensor[..., :4] = self._rots.get_quats() + tensor[..., 4:] = self._trans + + return tensor + + @staticmethod + def from_tensor_7(t: torch.Tensor, normalize_quats: bool = False) -> Rigid: + if t.shape[-1] != 7: + raise ValueError("Incorrectly shaped input tensor") + + quats, trans = t[..., :4], t[..., 4:] + + rots = Rotation(rot_mats=None, quats=quats, normalize_quats=normalize_quats) + + return Rigid(rots, trans) + + @staticmethod + def from_3_points( + p_neg_x_axis: torch.Tensor, origin: torch.Tensor, p_xy_plane: torch.Tensor, eps: float = 1e-8 + ) -> Rigid: + """ + Implements algorithm 21. Constructs transformations from sets of 3 points using the Gram-Schmidt algorithm. + + Args: + p_neg_x_axis: [*, 3] coordinates + origin: [*, 3] coordinates used as frame origins + p_xy_plane: [*, 3] coordinates + eps: Small epsilon value + Returns: + A transformation object of shape [*] + """ + p_neg_x_axis_unbound = torch.unbind(p_neg_x_axis, dim=-1) + origin_unbound = torch.unbind(origin, dim=-1) + p_xy_plane_unbound = torch.unbind(p_xy_plane, dim=-1) + + e0 = [c1 - c2 for c1, c2 in zip(origin_unbound, p_neg_x_axis_unbound)] + e1 = [c1 - c2 for c1, c2 in zip(p_xy_plane_unbound, origin_unbound)] + + denom = torch.sqrt(sum(c * c for c in e0) + eps * torch.ones_like(e0[0])) + e0 = [c / denom for c in e0] + dot = sum((c1 * c2 for c1, c2 in zip(e0, e1))) + e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)] + denom = torch.sqrt(sum(c * c for c in e1) + eps * torch.ones_like(e1[0])) + e1 = [c / denom for c in e1] + e2 = [ + e0[1] * e1[2] - e0[2] * e1[1], + e0[2] * e1[0] - e0[0] * e1[2], + e0[0] * e1[1] - e0[1] * e1[0], + ] + + rots = torch.stack([c for tup in zip(e0, e1, e2) for c in tup], dim=-1) + rots = rots.reshape(rots.shape[:-1] + (3, 3)) + + rot_obj = Rotation(rot_mats=rots, quats=None) + + return Rigid(rot_obj, torch.stack(origin_unbound, dim=-1)) + + def unsqueeze(self, dim: int) -> Rigid: + """ + Analogous to torch.unsqueeze. The dimension is relative to the shared dimensions of the rotation/translation. + + Args: + dim: A positive or negative dimension index. + Returns: + The unsqueezed transformation. + """ + if dim >= len(self.shape): + raise ValueError("Invalid dimension") + rots = self._rots.unsqueeze(dim) + trans = self._trans.unsqueeze(dim if dim >= 0 else dim - 1) + + return Rigid(rots, trans) + + @staticmethod + def cat(ts: Sequence[Rigid], dim: int) -> Rigid: + """ + Concatenates transformations along a new dimension. + + Args: + ts: + A list of T objects + dim: + The dimension along which the transformations should be concatenated + Returns: + A concatenated transformation object + """ + rots = Rotation.cat([t._rots for t in ts], dim) + trans = torch.cat([t._trans for t in ts], dim=dim if dim >= 0 else dim - 1) + + return Rigid(rots, trans) + + def apply_rot_fn(self, fn: Callable[[Rotation], Rotation]) -> Rigid: + """ + Applies a Rotation -> Rotation function to the stored rotation object. + + Args: + fn: A function of type Rotation -> Rotation + Returns: + A transformation object with a transformed rotation. + """ + return Rigid(fn(self._rots), self._trans) + + def apply_trans_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid: + """ + Applies a Tensor -> Tensor function to the stored translation. + + Args: + fn: + A function of type Tensor -> Tensor to be applied to the translation + Returns: + A transformation object with a transformed translation. + """ + return Rigid(self._rots, fn(self._trans)) + + def scale_translation(self, trans_scale_factor: float) -> Rigid: + """ + Scales the translation by a constant factor. + + Args: + trans_scale_factor: + The constant factor + Returns: + A transformation object with a scaled translation. + """ + return self.apply_trans_fn(lambda t: t * trans_scale_factor) + + def stop_rot_gradient(self) -> Rigid: + """ + Detaches the underlying rotation object + + Returns: + A transformation object with detached rotations + """ + return self.apply_rot_fn(lambda r: r.detach()) + + @staticmethod + def make_transform_from_reference( + n_xyz: torch.Tensor, ca_xyz: torch.Tensor, c_xyz: torch.Tensor, eps: float = 1e-20 + ) -> Rigid: + """ + Returns a transformation object from reference coordinates. + + Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard + way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You + need to take care of such cases in your code. + + Args: + n_xyz: A [*, 3] tensor of nitrogen xyz coordinates. + ca_xyz: A [*, 3] tensor of carbon alpha xyz coordinates. + c_xyz: A [*, 3] tensor of carbon xyz coordinates. + Returns: + A transformation object. After applying the translation and rotation to the reference backbone, the + coordinates will approximately equal to the input coordinates. + """ + translation = -1 * ca_xyz + n_xyz = n_xyz + translation + c_xyz = c_xyz + translation + + c_x, c_y, c_z = [c_xyz[..., i] for i in range(3)] + norm = torch.sqrt(eps + c_x**2 + c_y**2) + sin_c1 = -c_y / norm + cos_c1 = c_x / norm + + c1_rots = sin_c1.new_zeros((*sin_c1.shape, 3, 3)) + c1_rots[..., 0, 0] = cos_c1 + c1_rots[..., 0, 1] = -1 * sin_c1 + c1_rots[..., 1, 0] = sin_c1 + c1_rots[..., 1, 1] = cos_c1 + c1_rots[..., 2, 2] = 1 + + norm = torch.sqrt(eps + c_x**2 + c_y**2 + c_z**2) + sin_c2 = c_z / norm + cos_c2 = torch.sqrt(c_x**2 + c_y**2) / norm + + c2_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3)) + c2_rots[..., 0, 0] = cos_c2 + c2_rots[..., 0, 2] = sin_c2 + c2_rots[..., 1, 1] = 1 + c2_rots[..., 2, 0] = -1 * sin_c2 + c2_rots[..., 2, 2] = cos_c2 + + c_rots = rot_matmul(c2_rots, c1_rots) + n_xyz = rot_vec_mul(c_rots, n_xyz) + + _, n_y, n_z = [n_xyz[..., i] for i in range(3)] + norm = torch.sqrt(eps + n_y**2 + n_z**2) + sin_n = -n_z / norm + cos_n = n_y / norm + + n_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3)) + n_rots[..., 0, 0] = 1 + n_rots[..., 1, 1] = cos_n + n_rots[..., 1, 2] = -1 * sin_n + n_rots[..., 2, 1] = sin_n + n_rots[..., 2, 2] = cos_n + + rots = rot_matmul(n_rots, c_rots) + + rots = rots.transpose(-1, -2) + translation = -1 * translation + + rot_obj = Rotation(rot_mats=rots, quats=None) + + return Rigid(rot_obj, translation) + + def cuda(self) -> Rigid: + """ + Moves the transformation object to GPU memory + + Returns: + A version of the transformation on GPU + """ + return Rigid(self._rots.cuda(), self._trans.cuda()) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/tensor_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/tensor_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..449c810aed3f51b34a96674f29d5dc8bdfc35103 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/tensor_utils.py @@ -0,0 +1,140 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +from typing import Any, Callable, TypeVar, Union, overload + +import torch +import torch.nn as nn +import torch.types + + +def add(m1: torch.Tensor, m2: torch.Tensor, inplace: bool) -> torch.Tensor: + # The first operation in a checkpoint can't be in-place, but it's + # nice to have in-place addition during inference. Thus... + if not inplace: + m1 = m1 + m2 + else: + m1 += m2 + + return m1 + + +def permute_final_dims(tensor: torch.Tensor, inds: list[int]) -> torch.Tensor: + zero_index = -1 * len(inds) + first_inds = list(range(len(tensor.shape[:zero_index]))) + return tensor.permute(first_inds + [zero_index + i for i in inds]) + + +def flatten_final_dims(t: torch.Tensor, no_dims: int) -> torch.Tensor: + return t.reshape(t.shape[:-no_dims] + (-1,)) + + +def masked_mean(mask: torch.Tensor, value: torch.Tensor, dim: int, eps: float = 1e-4) -> torch.Tensor: + mask = mask.expand(*value.shape) + return torch.sum(mask * value, dim=dim) / (eps + torch.sum(mask, dim=dim)) + + +def pts_to_distogram( + pts: torch.Tensor, min_bin: torch.types.Number = 2.3125, max_bin: torch.types.Number = 21.6875, no_bins: int = 64 +) -> torch.Tensor: + boundaries = torch.linspace(min_bin, max_bin, no_bins - 1, device=pts.device) + dists = torch.sqrt(torch.sum((pts.unsqueeze(-2) - pts.unsqueeze(-3)) ** 2, dim=-1)) + return torch.bucketize(dists, boundaries) + + +def dict_multimap(fn: Callable[[list], Any], dicts: list[dict]) -> dict: + first = dicts[0] + new_dict = {} + for k, v in first.items(): + all_v = [d[k] for d in dicts] + if isinstance(v, dict): + new_dict[k] = dict_multimap(fn, all_v) + else: + new_dict[k] = fn(all_v) + + return new_dict + + +def one_hot(x: torch.Tensor, v_bins: torch.Tensor) -> torch.Tensor: + reshaped_bins = v_bins.view(((1,) * len(x.shape)) + (len(v_bins),)) + diffs = x[..., None] - reshaped_bins + am = torch.argmin(torch.abs(diffs), dim=-1) + return nn.functional.one_hot(am, num_classes=len(v_bins)).float() + + +def batched_gather(data: torch.Tensor, inds: torch.Tensor, dim: int = 0, no_batch_dims: int = 0) -> torch.Tensor: + ranges: list[Union[slice, torch.Tensor]] = [] + for i, s in enumerate(data.shape[:no_batch_dims]): + r = torch.arange(s) + r = r.view(*(*((1,) * i), -1, *((1,) * (len(inds.shape) - i - 1)))) + ranges.append(r) + + remaining_dims: list[Union[slice, torch.Tensor]] = [slice(None) for _ in range(len(data.shape) - no_batch_dims)] + remaining_dims[dim - no_batch_dims if dim >= 0 else dim] = inds + ranges.extend(remaining_dims) + # Matt note: Editing this to get around the behaviour of using a list as an array index changing + # in recent Numpy versions + return data[tuple(ranges)] + + +T = TypeVar("T") + + +# With tree_map, a poor man's JAX tree_map +def dict_map( + fn: Callable[[T], Any], dic: dict[Any, Union[dict, list, tuple, T]], leaf_type: type[T] +) -> dict[Any, Union[dict, list, tuple, Any]]: + new_dict: dict[Any, Union[dict, list, tuple, Any]] = {} + for k, v in dic.items(): + if isinstance(v, dict): + new_dict[k] = dict_map(fn, v, leaf_type) + else: + new_dict[k] = tree_map(fn, v, leaf_type) + + return new_dict + + +@overload +def tree_map(fn: Callable[[T], Any], tree: T, leaf_type: type[T]) -> Any: ... + + +@overload +def tree_map(fn: Callable[[T], Any], tree: dict, leaf_type: type[T]) -> dict: ... + + +@overload +def tree_map(fn: Callable[[T], Any], tree: list, leaf_type: type[T]) -> list: ... + + +@overload +def tree_map(fn: Callable[[T], Any], tree: tuple, leaf_type: type[T]) -> tuple: ... + + +def tree_map(fn, tree, leaf_type): + if isinstance(tree, dict): + return dict_map(fn, tree, leaf_type) + elif isinstance(tree, list): + return [tree_map(fn, x, leaf_type) for x in tree] + elif isinstance(tree, tuple): + return tuple(tree_map(fn, x, leaf_type) for x in tree) + elif isinstance(tree, leaf_type): + return fn(tree) + else: + print(type(tree)) + raise TypeError("Not supported") + + +tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36481e69b4378636f90d4ac0a6a54a2242af91ee Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/configuration_evolla.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/configuration_evolla.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a7f86a511573c58cf7751e6ad6eae741acee15a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/configuration_evolla.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/modeling_evolla.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/modeling_evolla.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9aef1d0dcfa605387a8a4287d2a751d3e551a0f2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/modeling_evolla.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/modular_evolla.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/modular_evolla.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..203c1a64a3930f9c073dc36480d3ce9281070ca0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/modular_evolla.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e33a3612c733ddc6d9f513509cc8985db1901b0f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/configuration_exaone4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/configuration_exaone4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef432bf5eadbeff7c8cfb2c9713accd363220a45 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/configuration_exaone4.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/modeling_exaone4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/modeling_exaone4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f944ee5d63d20ac4f5e457dfb99188cb59531772 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/modeling_exaone4.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/modular_exaone4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/modular_exaone4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d79ecb912c7291e7594e5308b1014ce1c8942ed9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/modular_exaone4.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47b57d1de5d8f87b76b60d03d017d8a5141c3f58 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/configuration_falcon.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/configuration_falcon.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b399d000efcb580c1bd878bd086c25fd55c2c2f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/configuration_falcon.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/modeling_falcon.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/modeling_falcon.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02162f757a9ecb1cbe3fb01e74431304fd06731b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/modeling_falcon.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/modeling_falcon_mamba.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/modeling_falcon_mamba.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93c752a2fa5afc645a9a9f00c6bd234812d67d77 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/modeling_falcon_mamba.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/modular_falcon_mamba.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/modular_falcon_mamba.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..baf1b2935e3fccd6e34a23012923c5c169ddb5cc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/modular_falcon_mamba.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6497de5149931f8d9c302661cd6d9b2479d6897 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/configuration_fastspeech2_conformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/configuration_fastspeech2_conformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0782eceb1c67760538352daabb53331970a2913c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/configuration_fastspeech2_conformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/modeling_fastspeech2_conformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/modeling_fastspeech2_conformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6a2a5deab4aa46a58b188977c9bf3335522e545 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/modeling_fastspeech2_conformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/tokenization_fastspeech2_conformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/tokenization_fastspeech2_conformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42f83bf9cf3b4328acce2e29089f6ab5c435f938 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/tokenization_fastspeech2_conformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a26070726d090a5f5914384f3626c8af6af39f06 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/configuration_flava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/configuration_flava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25ee890c7fd29f04b015394d50b9bf498ffc09bc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/configuration_flava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/feature_extraction_flava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/feature_extraction_flava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b7a08eef7ab6fe8e3ee0a3474d469e64a33e531 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/feature_extraction_flava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/image_processing_flava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/image_processing_flava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3785c2af8b6373bb4ceff8bc0df139115c02fcf1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/image_processing_flava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/image_processing_flava_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/image_processing_flava_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff7da20da4b31ac113a76fa6aaaa1a82034d9a72 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/image_processing_flava_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/processing_flava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/processing_flava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b846639274a5ff466a1205aa4542bf2518d4a7c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/processing_flava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22de82d9e96490c1319fbe47d4ca3c4c4966849f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/configuration_flex_olmo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/configuration_flex_olmo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b47044534f5fd840cffda9cd07e290dd4fa38ef Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/configuration_flex_olmo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/modeling_flex_olmo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/modeling_flex_olmo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce1a4813ef4f3d213fe202a97ff4e3640e3616b5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/modeling_flex_olmo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/modular_flex_olmo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/modular_flex_olmo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..036555ab83006f455e4a0314541469169ce813dc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/modular_flex_olmo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5883d5bf73d4de6ffa708d022863df7b11714337 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/configuration_focalnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/configuration_focalnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..223e02718101368fdc175eb2663831fc6a481a5f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/configuration_focalnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/modeling_focalnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/modeling_focalnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b556738d5aeaa6e6b8ecbce7e47cbbb50e2f9835 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/modeling_focalnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c2fbc8d389d00fb4a2a7e76bba92e31fa2e795e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/configuration_fsmt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/configuration_fsmt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25500da7548e0382094d3ad819abf014b051e76e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/configuration_fsmt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/modeling_fsmt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/modeling_fsmt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..631002676875ecc5a198e8499b34fc90f2245414 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/modeling_fsmt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/tokenization_fsmt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/tokenization_fsmt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c467acac282db11fc7426e57d391243f0dff97f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/tokenization_fsmt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f719d800084d58c9fffa884697bc81e7beb95832 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/configuration_funnel.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/configuration_funnel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6b2aa9101677aa54c3f339265587d7e3440271b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/configuration_funnel.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/modeling_funnel.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/modeling_funnel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03e3a38873cb4598c8d77ea8fc048e638614e19f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/modeling_funnel.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/modeling_tf_funnel.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/modeling_tf_funnel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..197d4a0d95020b59ccd15681d570798a7184181e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/modeling_tf_funnel.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f604bac921a9fbd46f0e87347c6a04c29d72a32a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc0d442764ced5e684f5e8953ebe9402b08f21a0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e9faacd3dfeb611c8ea8260a6a5409e667c32b1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/configuration_fuyu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/configuration_fuyu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..517e0f4754b09e805a51d8434ffe2c8a40451ed5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/configuration_fuyu.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/image_processing_fuyu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/image_processing_fuyu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c0352fa0279d23a8643a9207b1ecec1cd364a2c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/image_processing_fuyu.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/modeling_fuyu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/modeling_fuyu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..920dd703bb1d30fe013b1acbcf0690b68a2ca804 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/modeling_fuyu.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/processing_fuyu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/processing_fuyu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..744cd073da9d6a6905b73f69d9bcc443961177fe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/processing_fuyu.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9939bc40f83ca6006cdae305c005f1e26da8c26b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/configuration_gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/configuration_gemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5dea69bfeb7b2106c300e1ced8e1609c889f620 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/configuration_gemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modeling_flax_gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modeling_flax_gemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3abab8b48efcbebb6a71565a6fb2432ec6e1779 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modeling_flax_gemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modeling_gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modeling_gemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3bde7a93e0551438b05a9eb6ffb6479976561324 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modeling_gemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modular_gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modular_gemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..959d895083cffdb7387168a09fc709ab0b8e2a4d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modular_gemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f3a9cc68ce29a9a2ade8b78d355815f70fa6e98 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b71042e7fbb21a76fe3cd955bd345bfa35b26f0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/modeling_git.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/modeling_git.py new file mode 100644 index 0000000000000000000000000000000000000000..bc037912c5c52b869099fd4c35ff1bd4ee0fd1c8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/modeling_git.py @@ -0,0 +1,1460 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch GIT model.""" + +import math +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPast, + BaseModelOutputWithPooling, + CausalLMOutputWithPast, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + auto_docstring, + can_return_tuple, + logging, + torch_int, +) +from ...utils.deprecation import deprecate_kwarg +from .configuration_git import GitConfig, GitVisionConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + """ +) +# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Git +class GitVisionModelOutput(ModelOutput): + r""" + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +class GitEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + embeddings = self.word_embeddings(input_ids) + else: + embeddings = inputs_embeds + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class GitSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.image_patch_tokens = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1) + if config.num_image_with_embedding is not None: + self.image_patch_tokens *= config.num_image_with_embedding + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + pixel_values_present: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + batch_size, seq_length, _ = hidden_states.shape + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + cutoff = self.image_patch_tokens if pixel_values_present else 0 + key_layer = ( + self.key(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + if past_key_values is not None: + # NOTE: like in other caches, we store the text component. In GIT it means we discard the image component. + key_layer_past, value_layer_past = past_key_values.update( + key_layer[:, :, cutoff:, :], value_layer[:, :, cutoff:, :], self.layer_idx + ) + key_layer = torch.cat([key_layer[:, :, :cutoff, :], key_layer_past], dim=2) + value_layer = torch.cat([value_layer[:, :, :cutoff, :], value_layer_past], dim=2) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if past_key_values is not None: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GitModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, attention_probs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class GitSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +GIT_SELF_ATTENTION_CLASSES = { + "eager": GitSelfAttention, +} + + +class GitAttention(nn.Module): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__() + self.self = GIT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type, layer_idx=layer_idx + ) + self.output = GitSelfOutput(config) + self.pruned_heads = set() + + # Copied from transformers.models.bert.modeling_bert.BertAttention.prune_heads + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + pixel_values_present: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + attn_output, self_attn_weights = self.self( + hidden_states, + attention_mask, + head_mask, + past_key_values, + output_attentions, + pixel_values_present, + ) + attention_output = self.output(attn_output, hidden_states) + return attention_output, self_attn_weights + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class GitIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class GitOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class GitLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_idx=None): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = GitAttention(config, layer_idx=layer_idx) + self.intermediate = GitIntermediate(config) + self.output = GitOutput(config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + pixel_values_present: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + attention_output, self_attention_weights = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_values=past_key_values, + pixel_values_present=pixel_values_present, + ) + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + return layer_output, self_attention_weights + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class GitEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([GitLayer(config, i) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + pixel_values_present: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]: + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + past_key_values, + output_attentions, + pixel_values_present, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + past_key_values, + all_hidden_states, + all_self_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +@auto_docstring +class GitPreTrainedModel(PreTrainedModel): + config: GitConfig + base_model_prefix = "git" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, GitVisionEmbeddings): + nn.init.normal_(module.class_embedding, mean=0.0, std=self.config.initializer_range) + nn.init.normal_(module.patch_embedding.weight, std=self.config.initializer_range) + nn.init.normal_(module.position_embedding.weight, std=self.config.initializer_range) + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->Git +class GitVisionEmbeddings(nn.Module): + def __init__(self, config: GitVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embedding(self.position_ids) + + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})." + ) + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class GitVisionMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.siglip.modeling_siglip.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class GitVisionAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + self.is_causal = False + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, seq_length, embed_dim = hidden_states.shape + + queries = self.q_proj(hidden_states) + keys = self.k_proj(hidden_states) + values = self.v_proj(hidden_states) + + queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + # CLIP text model uses both `causal_attention_mask` and `attention_mask` + # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask` + if self.config._attn_implementation != "flash_attention_2": + if attention_mask is not None and causal_attention_mask is not None: + attention_mask = attention_mask + causal_attention_mask + elif causal_attention_mask is not None: + attention_mask = causal_attention_mask + else: + self.is_causal = causal_attention_mask is not None + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + queries, + keys, + values, + attention_mask, + is_causal=self.is_causal, + scaling=self.scale, + dropout=0.0 if not self.training else self.dropout, + ) + + attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() + attn_output = self.out_proj(attn_output) + if not output_attentions: + attn_weights = None + return attn_output, attn_weights + + +# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoderLayer with AltCLIP->GitVision +class GitVisionEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: GitVisionConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = GitVisionAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = GitVisionMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->GitVision, CLIPConfig +class GitVisionEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`GitVisionEncoderLayer`]. + + Args: + config: GitVisionConfig + """ + + def __init__(self, config: GitVisionConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([GitVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + @can_return_tuple + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class GitVisionTransformer(nn.Module): + # Copied from transformers.models.altclip.modeling_altclip.AltCLIPVisionTransformer.__init__ with AltCLIPEncoder->GitVisionEncoder, AltCLIP->Git + def __init__(self, config: GitVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = GitVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.encoder = GitVisionEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + + last_hidden_state = self.post_layernorm(last_hidden_state) + + if not return_dict: + return (last_hidden_state,) + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=last_hidden_state, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The vision model from CLIP, used in GIT, without any head or projection on top. + """ +) +class GitVisionModel(GitPreTrainedModel): + config: GitVisionConfig + main_input_name = "pixel_values" + + # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP->Git + def __init__(self, config: GitVisionConfig): + super().__init__(config) + self.vision_model = GitVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, GitVisionModel + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base") + >>> model = GitVisionModel.from_pretrained("microsoft/git-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + +class GitProjection(nn.Module): + def __init__(self, config: GitConfig): + super().__init__() + self.config = config + self.visual_projection = nn.Sequential( + nn.Linear(config.vision_config.hidden_size, config.hidden_size), + nn.LayerNorm(config.hidden_size, eps=config.vision_config.layer_norm_eps), + ) + + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: + return self.visual_projection(embeddings) + + +@auto_docstring( + custom_intro=""" + The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states + """ +) +class GitModel(GitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.embeddings = GitEmbeddings(config) + self.image_encoder = GitVisionModel(config.vision_config) + self.encoder = GitEncoder(config) + + self.visual_projection = GitProjection(config) + + if config.num_image_with_embedding is not None: + self.img_temporal_embedding = nn.ParameterList( + nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size)) + for _ in range(config.num_image_with_embedding) + ) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def _generate_future_mask(self, size: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + # Default mask is for forward direction. Flip for backward direction. + mask = torch.triu(torch.ones(size, size, device=device, dtype=dtype), diagonal=1) + mask = mask.masked_fill(mask == 1, float("-inf")) + return mask + + def create_attention_mask(self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None): + num_tgt = tgt.shape[1] + num_memory = memory.shape[1] + device = tgt.device + dtype = tgt.dtype + top_left = torch.zeros((num_memory, num_memory), device=device, dtype=dtype) + top_right = torch.full( + (num_memory, num_tgt + past_key_values_length), + float("-inf"), + device=tgt.device, + dtype=dtype, + ) + bottom_left = torch.zeros( + (num_tgt, num_memory), + dtype=dtype, + device=tgt_mask.device, + ) + + if past_key_values_length > 0: + tgt_mask = torch.zeros( + (tgt_mask.shape[0], tgt_mask.shape[0] + past_key_values_length), + dtype=dtype, + device=tgt_mask.device, + ) + + left = torch.cat((top_left, bottom_left), dim=0) + right = torch.cat((top_right, tgt_mask.to(dtype)), dim=0) + + full_attention_mask = torch.cat((left, right), dim=1)[None, :] + + if memory_key_padding_mask is None: + memory_key_padding_mask = torch.full((memory.shape[0], memory.shape[1]), fill_value=False, device=device) + # if it is False, it means valid. That is, it is not a padding + if memory_key_padding_mask.dtype != torch.bool: + raise ValueError("Memory key padding mask must be a boolean tensor.") + zero_negative_infinity = torch.zeros_like(memory_key_padding_mask, dtype=tgt.dtype) + zero_negative_infinity[memory_key_padding_mask] = float("-inf") + full_attention_mask = full_attention_mask.expand( + (memory_key_padding_mask.shape[0], num_memory + num_tgt, num_memory + past_key_values_length + num_tgt) + ) + full_attention_mask = full_attention_mask.clone() + origin_left = full_attention_mask[:, :, :num_memory] + update = zero_negative_infinity[:, None, :] + full_attention_mask[:, :, :num_memory] = origin_left + update + + # add axis for multi-head + full_attention_mask = full_attention_mask[:, None, :, :] + + return full_attention_mask + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPooling]: + r""" + Examples: + + ```python + >>> from transformers import AutoProcessor, AutoModel + >>> import requests + >>> from PIL import Image + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base") + >>> model = AutoModel.from_pretrained("microsoft/git-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> text = "this is an image of two cats" + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + seq_length = input_shape[1] + + # past_key_values_length + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = ( + past_key_values.get_seq_length() + if not isinstance(past_key_values, Cache) + else past_key_values.get_seq_length() + ) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + projected_visual_features = None + if pixel_values is not None: + if pixel_values.ndim == 4: + # here we assume pixel_values is of shape (batch_size, num_channels, height, width) + visual_features = self.image_encoder( + pixel_values, interpolate_pos_encoding=interpolate_pos_encoding + ).last_hidden_state + + elif pixel_values.ndim == 5: + # here we assume pixel_values is of shape (batch_size, num_frames, num_channels, height, width) + visual_features = [] + for frame_idx in range(pixel_values.shape[1]): + visual_features_frame = self.image_encoder( + pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding + ).last_hidden_state + visual_features_frame += self.img_temporal_embedding[frame_idx] + visual_features.append(visual_features_frame) + + # finally, concatenate all features along sequence dimension + visual_features = torch.cat(visual_features, dim=1) + + else: + raise ValueError("pixel_values must be of rank 4 or 5") + + projected_visual_features = self.visual_projection(visual_features) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + if projected_visual_features is None: + projected_visual_features = torch.zeros( + (embedding_output.shape[0], 0, embedding_output.shape[2]), + dtype=embedding_output.dtype, + device=embedding_output.device, + ) + + # Repeat visual features to match embedding batch size. + projected_visual_features = projected_visual_features.repeat( + embedding_output.size(0) // projected_visual_features.size(0), 1, 1 + ) + + # concatenate patch token and text token embeddings + hidden_states = torch.cat((projected_visual_features, embedding_output), dim=1) + + # By default, an additive causal mask is created + # for masking the future (one direction). + tgt_mask = self._generate_future_mask(seq_length, embedding_output.dtype, embedding_output.device) + + # Create an attention mask of shape (batch_size, 1, tgt_seq_len, src_seq_len) + combined_attention_mask = self.create_attention_mask( + tgt=embedding_output, + memory=projected_visual_features, + tgt_mask=tgt_mask, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # if the user provides an attention mask, we add it to the default one + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _prepare_4d_attention_mask( + attention_mask, embedding_output.dtype, tgt_len=input_shape[-1] + ).to(embedding_output.device) + if past_key_values_length > 0: + expanded_attn_mask = expanded_attn_mask[:, :, -past_key_values_length:, :] + else: + combined_attention_mask[:, :, -input_shape[1] :, -input_shape[1] :] += expanded_attn_mask + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=combined_attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + pixel_values_present=pixel_values is not None, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutputWithPast( + last_hidden_state=sequence_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + GIT Model with a `language modeling` head on top for autoregressive language modeling. + """ +) +class GitForCausalLM(GitPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["output.weight"] + + def __init__(self, config): + super().__init__(config) + + self.git = GitModel(config) + self.output = nn.Linear(config.hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.output + + def set_output_embeddings(self, new_embeddings): + self.output = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Cache, list[torch.Tensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + + Examples: + + Image captioning example: + + ```python + >>> from transformers import AutoProcessor, AutoModelForCausalLM + >>> import requests + >>> from PIL import Image + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco") + >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values + + >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50) + >>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + >>> print(generated_caption) + two cats sleeping on a pink blanket next to remotes. + ``` + + Visual question answering (VQA) example: + + ```python + >>> from transformers import AutoProcessor, AutoModelForCausalLM + >>> from huggingface_hub import hf_hub_download + >>> from PIL import Image + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa") + >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa") + + >>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset") + >>> image = Image.open(file_path).convert("RGB") + + >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values + + >>> question = "what does the front of the bus say at the top?" + + >>> input_ids = processor(text=question, add_special_tokens=False).input_ids + >>> input_ids = [processor.tokenizer.cls_token_id] + input_ids + >>> input_ids = torch.tensor(input_ids).unsqueeze(0) + + >>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50) + >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True)) + ['what does the front of the bus say at the top? special'] + ``` + + Video captioning example: + + ```python + >>> import av + >>> import numpy as np + >>> from PIL import Image + >>> from huggingface_hub import hf_hub_download + >>> from transformers import AutoProcessor, AutoModelForCausalLM + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex") + >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex") + + >>> # set seed for reproducibility + >>> np.random.seed(45) + + + >>> def read_video_pyav(container, indices): + ... ''' + ... Decode the video with PyAV decoder. + ... Args: + ... container (`av.container.input.InputContainer`): PyAV container. + ... indices (`list[int]`): List of frame indices to decode. + ... Returns: + ... result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). + ... ''' + ... frames = [] + ... container.seek(0) + ... start_index = indices[0] + ... end_index = indices[-1] + ... for i, frame in enumerate(container.decode(video=0)): + ... if i > end_index: + ... break + ... if i >= start_index and i in indices: + ... frames.append(frame) + ... return np.stack([x.to_ndarray(format="rgb24") for x in frames]) + + + >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len): + ... ''' + ... Sample a given number of frame indices from the video. + ... Args: + ... clip_len (`int`): Total number of frames to sample. + ... frame_sample_rate (`int`): Sample every n-th frame. + ... seg_len (`int`): Maximum allowed index of sample's last frame. + ... Returns: + ... indices (`list[int]`): List of sampled frame indices + ... ''' + ... converted_len = int(clip_len * frame_sample_rate) + ... end_idx = np.random.randint(converted_len, seg_len) + ... start_idx = end_idx - converted_len + ... indices = np.linspace(start_idx, end_idx, num=clip_len) + ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + ... return indices + + + >>> # load video + >>> file_path = hf_hub_download( + ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" + ... ) + >>> container = av.open(file_path) + + >>> # sample frames + >>> num_frames = model.config.num_image_with_embedding + >>> indices = sample_frame_indices( + ... clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames + ... ) + >>> frames = read_video_pyav(container, indices) + + >>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values + + >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50) + + >>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True)) + Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.'] + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.git( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + pixel_values=pixel_values, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.output(sequence_output) + + loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + num_image_tokens = self.git.encoder.layer[0].attention.self.image_patch_tokens + shifted_logits = logits[:, num_image_tokens:-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss = self.loss_function( + shifted_logits.view(-1, self.config.vocab_size), + labels.view(-1), + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs + ): + # Overwritten -- `git` has special cache handling and doesn't support generating from `inputs_embeds` atm + + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + past_length = past_key_values.get_seq_length() + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + input_shape = input_ids.shape + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + model_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": kwargs.get("pixel_values"), + "past_key_values": past_key_values, + "use_cache": use_cache, + } + + # Forward ALL kwargs that are uninitialized (e.g. `use_cache`). + for key, value in kwargs.items(): + if key not in model_inputs: + model_inputs[key] = value + + return model_inputs + + +__all__ = ["GitForCausalLM", "GitModel", "GitPreTrainedModel", "GitVisionModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1a73513e3caacdd62de5327c2ee1022fefce378 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/configuration_glm4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/configuration_glm4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e91b45da2235a6d1d3b99d08d537b7f06390d7a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/configuration_glm4.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modeling_glm4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modeling_glm4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2fb09f675c2fa217e5f8fda966cc679dbc47157 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modeling_glm4.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modular_glm4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modular_glm4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03b9d90dc91cb9d50d92187af3d60221328e7315 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modular_glm4.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ab15356ef75e8267fad42da3edda58cd9fd8e8f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/configuration_glm4v.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/configuration_glm4v.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c9b91b2c416af487b83521997e60a8ad2ffa8bb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/configuration_glm4v.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59e6802be823cb67816897a7c39317a395a0f537 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a898ef5d8b92b1d09bcc80cdb71266a881a7bda0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/modeling_glm4v.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/modeling_glm4v.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27ce6575bc2f40110c904e1200edf9d67c31361f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/modeling_glm4v.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/modular_glm4v.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/modular_glm4v.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5a088d10f5c76828fccb5be3957a5ac8e83067a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/modular_glm4v.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/processing_glm4v.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/processing_glm4v.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c509afc040ae9dcff9012873f7f57ac0f4170fb2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/processing_glm4v.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/video_processing_glm4v.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/video_processing_glm4v.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dadaff59bcd156745e612f4f093ea2cc5a689219 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/video_processing_glm4v.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0757b63702b9e9bdb3ba967ddecbd26ca990528b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/configuration_got_ocr2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/configuration_got_ocr2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..468020f65f0b839c7a27be874901259ddb9c3b3e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/configuration_got_ocr2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6bd4bfa01b6cb3ba130123e877adedb6ef6dbcc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0a2e9104b900684e14042fa79d480a86df4989f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modeling_got_ocr2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modeling_got_ocr2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..474baba5fd562256e91d30a59377114ebe723cad Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modeling_got_ocr2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modular_got_ocr2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modular_got_ocr2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36fc1e9e757757c6d0198d56dbb9c4976ac09662 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modular_got_ocr2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/processing_got_ocr2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/processing_got_ocr2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..379c6bf23f28c0227a45f2f2ebebf958cecba335 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/processing_got_ocr2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49c9146c62e39e67c83dce58894a277c9eec9a21 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/configuration_gpt2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/configuration_gpt2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ece9e529f844915862650cec50fe99b434c46e14 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/configuration_gpt2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_flax_gpt2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_flax_gpt2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c122aae93d03319b7c38a67ee2ca35656f69392 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_flax_gpt2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_gpt2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_gpt2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26057ea532231512299b710df7379fef62e794fb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_gpt2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_tf_gpt2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_tf_gpt2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff132753812b189d57a46cf0541c3becaa0b3d5b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_tf_gpt2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab28233ccaaac9297d57a162c06dd2f29a5b2365 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c294ab8fcf00daf2202437891a493853a1f71845 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_tf.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_tf.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74af609b3aa07215b27a4960aff70b2e0602b9bb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_tf.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d925ae82b0b75e9b47f28c3c502cd85afa9e75e8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/configuration_gpt_neox.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/configuration_gpt_neox.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ad918657f9bd51a1945a36aa4417692f1996dfc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/configuration_gpt_neox.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/modular_gpt_neox.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/modular_gpt_neox.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c67d9e3b6d0a6da75b0da0975ae330883b1a268 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/modular_gpt_neox.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/tokenization_gpt_neox_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/tokenization_gpt_neox_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97974b26e32409c419e8e47fa178497ed8f1fa52 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/tokenization_gpt_neox_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..374b61f801c73905ae7f0b70755794867e51763a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/configuration_gpt_neox_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/configuration_gpt_neox_japanese.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac02f97f9fed5f0ee72832b5648b15c42292bbbb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/configuration_gpt_neox_japanese.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/modeling_gpt_neox_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/modeling_gpt_neox_japanese.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..218edcb605020381f01b082cb57bb4bc84d9b0c0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/modeling_gpt_neox_japanese.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/tokenization_gpt_neox_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/tokenization_gpt_neox_japanese.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92ff2664a9cb494d3ef26155a79cb7e78b4c3f0d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/tokenization_gpt_neox_japanese.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..956087c1beaecbf1754ca8c504e4cd7bd213e09d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/configuration_gptj.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/configuration_gptj.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..238602497876cd50f479a45f879abfdf515f0f8c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/configuration_gptj.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_flax_gptj.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_flax_gptj.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..193222f28b7294887f17304fa61ec4d3072e02a0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_flax_gptj.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_gptj.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_gptj.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..066768e78fe56b85d87b6bd612985978d4cf0be0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_gptj.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_tf_gptj.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_tf_gptj.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..834f27da1e63e2e95a5fa5585fbae5b6b580a453 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_tf_gptj.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49fb0aef84fb229e8a7ddb275bf16b4d5a24350e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/configuration_granite.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/configuration_granite.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1850bf4c176228c4f6cd8c803f48cc54d2f225dd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/configuration_granite.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modeling_granite.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modeling_granite.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94967f0a8373a352f5a6757ef1820bf8d669e90a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modeling_granite.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modular_granite.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modular_granite.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4655cf52a4a13d93309e96be0a96b5a4b59780ec Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modular_granite.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f1df1f4dcb48c5a540d1fa92902c4495d3aacaad Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/configuration_granitemoehybrid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/configuration_granitemoehybrid.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28dec61f0dbeff4be018ddf61bc2857049f698ff Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/configuration_granitemoehybrid.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/modeling_granitemoehybrid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/modeling_granitemoehybrid.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b13df60442756fd7b84ddd7d67593cb2bf6f5beb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/modeling_granitemoehybrid.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/modular_granitemoehybrid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/modular_granitemoehybrid.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbf563c4964af7dea71ccc8956dd880ca34baf2e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/modular_granitemoehybrid.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5583717fdf5480c86324556a2a2fbf58b17660c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/configuration_granitemoeshared.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/configuration_granitemoeshared.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..961ed6016cc622306dca93770e721c55ef58f552 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/configuration_granitemoeshared.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/modeling_granitemoeshared.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/modeling_granitemoeshared.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa55a9b0d08d21e12cb74c4f17935a2e77885773 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/modeling_granitemoeshared.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/modular_granitemoeshared.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/modular_granitemoeshared.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65559a56f6df09ac2c5bfb32bdcef9dddcf97594 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/modular_granitemoeshared.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f6c83fbe9199fc0f60a995e224e8b50e669332d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/configuration_grounding_dino.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/configuration_grounding_dino.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c265911b80a34d9084681cc03d765b386716dfed Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/configuration_grounding_dino.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57f2a27cd65741f53538054b741466129714556f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a679e9f918a7271b312cdaed329690b5cfe985cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/modular_grounding_dino.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/modular_grounding_dino.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..106dc5cd925103f710cbaba498f21a54b264fa98 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/modular_grounding_dino.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/processing_grounding_dino.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/processing_grounding_dino.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b3e6c5696c90dc6c5202280ea6cbabb486a8e5c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/processing_grounding_dino.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6df34aa43f456f52d231db63025dab2212410c8a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/configuration_hgnet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/configuration_hgnet_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92f597d99e76d686e7bb69f8c3faa28c090b4d93 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/configuration_hgnet_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/modeling_hgnet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/modeling_hgnet_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e40037c788a4f0e380367ad9b3a85aa70de4e16f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/modeling_hgnet_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/modular_hgnet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/modular_hgnet_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0db23e82b2d7847d2614b2443ca5e58ec614cc86 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/modular_hgnet_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..841f13be4c0d2f48f54eecc916acd826395449af --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_hiera import * + from .modeling_hiera import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/configuration_hiera.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/configuration_hiera.py new file mode 100644 index 0000000000000000000000000000000000000000..2342d7e562a50de0c0937040a8e8279c7860e931 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/configuration_hiera.py @@ -0,0 +1,194 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Hiera model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class HieraConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`HieraModel`]. It is used to instantiate a Hiera + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Hiera + [facebook/hiera-base-224](https://huggingface.co/facebook/hiera-base-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + embed_dim (`int`, *optional*, defaults to 96): + Dimensionality of patch embedding. + image_size (`list(int)`, *optional*, defaults to `[224, 224]`): + The size (resolution) of input in the format (height, width) for images + and (frames, height, width) for videos. + patch_size (`list(int)`, *optional*, defaults to `[7, 7]`): + The size (resolution) of each patch. + patch_stride (`list(int)`, *optional*, defaults to `[4, 4]`): + The stride of the patch. + patch_padding (`list(int)`, *optional*, defaults to `[3, 3]`): + The padding of the patch. + mlp_ratio (`float`, *optional*, defaults to 4.0): + The ratio of mlp hidden dim to embedding dim. + depths (`list(int)`, *optional*, defaults to `[2, 3, 16, 3]`): + Depth of each layer in the Transformer encoder. + num_heads (`list(int)`, *optional*, defaults to `[1, 2, 4, 8]`): + Number of attention heads in each layer of the Transformer encoder. + embed_dim_multiplier (`float`, *optional*, defaults to 2.0): + The multiplier to the dimensionality of patch embedding in each layer of the Transformer encoder. + num_query_pool (`int`, *optional*, defaults to 3): + The number of query pool stages. + query_stride (`list(int)`, *optional*, defaults to `[2, 2]`): + The stride of the query pool. + masked_unit_size (`list(int)`, *optional*, defaults to `[8, 8]`): + The size of the masked unit. + masked_unit_attention (`list(bool)`, *optional*, defaults to `[True, True, False, False]`): + Whether to use masked unit attention in each layer of the Transformer encoder. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The drop path rate. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices and + the zero_initializer for initializing all bias vectors. + layer_norm_init (`float`, *optional*, defaults to 1.0): + The initial weight value for layer normalization layers. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + decoder_hidden_size (`int`, *optional*): + Dimensionality of decoder embeddings for MAE pretraining. + decoder_depth (`int`, *optional*): + Depth of the decoder for MAE pretraining. + decoder_num_heads (`int`, *optional*): + Number of attention heads in each layer of the decoder for MAE pretraining. + normalize_pixel_loss (`bool`, *optional*, defaults to `True`): + Whether to normalize the pixel loss by the number of pixels. + mask_ratio (`float`, *optional*, defaults to 0.6): + The ratio of masked tokens in the input. + out_features (`list[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`list[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + + + Example: + + ```python + >>> from transformers import HieraConfig, HieraModel + + >>> # Initializing a Hiera hiera-base-patch16-224 style configuration + >>> configuration = HieraConfig() + + >>> # Initializing a model (with random weights) from the hiera-base-patch16-224 style configuration + >>> model = HieraModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "hiera" + + attribute_map = {"num_hidden_layers": "num_layers"} + + def __init__( + self, + embed_dim=96, + image_size=[224, 224], + patch_size=[7, 7], + patch_stride=[4, 4], + patch_padding=[3, 3], + mlp_ratio=4.0, + depths=[2, 3, 16, 3], + num_heads=[1, 2, 4, 8], + embed_dim_multiplier=2.0, + num_query_pool=3, + query_stride=[2, 2], + masked_unit_size=[8, 8], + masked_unit_attention=[True, True, False, False], + drop_path_rate=0.0, + num_channels=3, + hidden_act="gelu", + initializer_range=0.02, + layer_norm_init=1.0, + layer_norm_eps=1e-6, + decoder_hidden_size=None, + decoder_depth=None, + decoder_num_heads=None, + normalize_pixel_loss=True, + mask_ratio=0.6, + out_features=None, + out_indices=None, + **kwargs, + ): + super().__init__(**kwargs) + if masked_unit_size[0] % query_stride[0] ** (len(depths) - 1) != 0: + raise ValueError( + f"masked_unit_size[0] ({masked_unit_size[0]}) must be divisible by query_stride[0] ({query_stride[0]}) " + f"raised to the power of the number of layers ({len(depths) - 1})" + ) + + if num_query_pool >= len(depths): + raise ValueError( + f"num_query_pool ({num_query_pool}) must be less than the number of layers ({len(depths)})" + ) + + self.embed_dim = embed_dim + self.image_size = image_size + self.patch_size = patch_size + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.mlp_ratio = mlp_ratio + self.depths = depths + self.num_heads = num_heads + self.num_layers = len(depths) + self.embed_dim_multiplier = embed_dim_multiplier + self.num_query_pool = num_query_pool + self.query_stride = query_stride + self.masked_unit_size = masked_unit_size + self.masked_unit_attention = masked_unit_attention + self.drop_path_rate = drop_path_rate + self.num_channels = num_channels + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.layer_norm_init = layer_norm_init + self.layer_norm_eps = layer_norm_eps + self.decoder_hidden_size = decoder_hidden_size + self.decoder_depth = decoder_depth + self.decoder_num_heads = decoder_num_heads + self.normalize_pixel_loss = normalize_pixel_loss + self.mask_ratio = mask_ratio + # we set the hidden_size attribute in order to make Hiera work with VisionEncoderDecoderModel + # this indicates the channel dimension after the last stage of the model + self.hidden_size = int(embed_dim * embed_dim_multiplier ** (len(depths) - 1)) + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + + +__all__ = ["HieraConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/modeling_hiera.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/modeling_hiera.py new file mode 100644 index 0000000000000000000000000000000000000000..0c084f0f836e18c4891491ea983676e9752e80a2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/modeling_hiera.py @@ -0,0 +1,1439 @@ +# coding=utf-8 +# Copyright 2024 Meta and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Hiera model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BackboneOutput, + BaseModelOutput, + BaseModelOutputWithPooling, + ImageClassifierOutput, + ModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, logging, torch_int +from ...utils.backbone_utils import BackboneMixin +from .configuration_hiera import HieraConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Hiera encoder's outputs, with potential hidden states and attentions. + """ +) +class HieraEncoderOutput(ModelOutput): + r""" + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Hiera model's outputs that also contains a pooling of the last hidden states. + """ +) +class HieraModelOutput(ModelOutput): + r""" + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed): + Average pooling of the last layer hidden-state. + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`): + Tensor indicating which patches are masked (0) and which are not (1). + ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Tensor containing the original index of the (shuffled) masked patches. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + bool_masked_pos: Optional[torch.BoolTensor] = None + ids_restore: Optional[torch.LongTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Hiera image classification outputs. + """ +) +class HieraForImageClassificationOutput(ImageClassifierOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, `optional`): + Loss value for the training task. + logits (`torch.FloatTensor` of shape `(batch_size, num_labels)`): + Prediction scores of the classification head (logits of the output layer). + hidden_states (`tuple(torch.FloatTensor)`, `optional`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, `optional`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, `optional`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Class for HieraForPreTraining's outputs, with potential hidden states and attentions. + """ +) +class HieraForPreTrainingOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`): + Pixel reconstruction loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`): + Pixel reconstruction logits. + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`): + Tensor indicating which patches are masked (0) and which are not (1). + ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Tensor containing the original index of the (shuffled) masked patches. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, height, width, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs reshaped to include the spatial dimensions. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + bool_masked_pos: Optional[torch.BoolTensor] = None + ids_restore: Optional[torch.LongTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None + + +class HieraPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config, is_mae: bool = False): + super().__init__() + + # Support any number of spatial dimensions + self.spatial_dims = len(config.patch_size) + if self.spatial_dims != 2: + raise ValueError(f"The number of dimensions of the input image should be 2, but got {self.spatial_dims}.") + self.num_channels = config.num_channels + self.image_size = config.image_size[-2:] + self.tokens_spatial_shape = [i // s for i, s in zip(config.image_size, config.patch_stride)] + self.mask_spatial_shape = [i // s for i, s in zip(self.tokens_spatial_shape, config.masked_unit_size)] + self.mask_ratio = config.mask_ratio + self.is_mae = is_mae + self.projection = nn.Conv2d( + self.num_channels, + config.embed_dim, + kernel_size=config.patch_size, + stride=config.patch_stride, + padding=config.patch_padding, + ) + + def masked_conv( + self, pixel_values: torch.FloatTensor, bool_masked_pos: Optional[torch.BoolTensor] = None + ) -> torch.Tensor: + """Zero-out the masked regions of the input before conv. + Prevents leakage of masked regions when using overlapping kernels. + """ + if bool_masked_pos is None: + return self.projection(pixel_values) + + target_size = pixel_values.shape[2:] + # Reshape bool_masked_pos to (batch_size, 1, mask_unit_height, mask_unit_width) + bool_masked_pos = bool_masked_pos.view(pixel_values.shape[0], 1, *self.mask_spatial_shape) + + bool_masked_pos = nn.functional.interpolate(bool_masked_pos.float(), size=target_size) + + return self.projection(pixel_values * bool_masked_pos) + + def random_masking( + self, pixel_values: torch.FloatTensor, noise: Optional[torch.FloatTensor] = None + ) -> tuple[torch.BoolTensor, torch.LongTensor]: + """ + Perform per-sample random masking by per-sample shuffling. Per-sample shuffling is done by argsort random + noise. + + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`) + noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*) which is + mainly used for testing purposes to control randomness and maintain the reproducibility + """ + batch_size = pixel_values.shape[0] + # Tokens selected for masking at mask unit level + num_windows = math.prod(self.mask_spatial_shape) + len_keep = int(num_windows * (1 - self.mask_ratio)) + + if noise is None: + noise = torch.rand(batch_size, num_windows, device=pixel_values.device) + + # Sort noise for each sample + ids_shuffle = torch.argsort(noise, dim=1) + # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=1).to(pixel_values.device) + + # Generate the binary bool_masked_pos: 1 is *keep*, 0 is *remove* + # Note this is opposite to original MAE + bool_masked_pos = torch.zeros([batch_size, num_windows], device=pixel_values.device) + bool_masked_pos[:, :len_keep] = 1 + # Unshuffle to get the binary bool_masked_pos + bool_masked_pos = torch.gather(bool_masked_pos, dim=1, index=ids_restore).bool() + + return bool_masked_pos, ids_restore + + def forward( + self, + pixel_values: torch.FloatTensor, + noise: Optional[torch.FloatTensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.BoolTensor], Optional[torch.LongTensor]]: + (bool_masked_pos, ids_restore) = ( + self.random_masking(pixel_values, noise=noise) if self.is_mae else (None, None) + ) + + embeddings = self.masked_conv(pixel_values, bool_masked_pos) + embeddings = embeddings.flatten(2).transpose(2, 1) + + return embeddings, bool_masked_pos, ids_restore + + +class HieraEmbeddings(nn.Module): + """ + Construct position and patch embeddings. + """ + + def __init__(self, config: HieraConfig, is_mae: bool = False) -> None: + super().__init__() + self.patch_stride = config.patch_stride + tokens_spatial_shape = [i // s for i, s in zip(config.image_size, config.patch_stride)] + self.mask_spatial_shape = [i // s for i, s in zip(tokens_spatial_shape, config.masked_unit_size)] + self.num_tokens = math.prod(tokens_spatial_shape) + self.is_mae = is_mae + + self.patch_embeddings = HieraPatchEmbeddings(config, is_mae=is_mae) + + self.position_embeddings = nn.Parameter(torch.zeros(1, self.num_tokens, config.embed_dim)) + + def interpolate_pos_encoding( + self, embeddings: torch.Tensor, pos_embeds: torch.Tensor, height: int, width: int + ) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing, no class embeddings, and different patch strides. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] + num_positions = pos_embeds.shape[1] + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return pos_embeds + + dim = embeddings.shape[-1] + + new_height = height // self.patch_stride[0] + new_width = width // self.patch_stride[1] + + sqrt_num_positions = torch_int(num_positions**0.5) + pos_embeds = pos_embeds.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + pos_embeds = pos_embeds.permute(0, 3, 1, 2) + + pos_embeds = nn.functional.interpolate( + pos_embeds, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + pos_embeds = pos_embeds.permute(0, 2, 3, 1).view(1, -1, dim) + return pos_embeds + + def get_position_embedding( + self, embeddings: torch.Tensor, height: int, width: int, interpolate_pos_encoding: bool + ) -> torch.FloatTensor: + return ( + self.interpolate_pos_encoding(embeddings, self.position_embeddings, height, width) + if interpolate_pos_encoding + else self.position_embeddings + ) + + def forward( + self, + pixel_values: torch.FloatTensor, + noise: Optional[torch.FloatTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.BoolTensor], Optional[torch.LongTensor]]: + height, width = pixel_values.shape[-2:] + embeddings, bool_masked_pos, ids_restore = self.patch_embeddings(pixel_values, noise=noise) + embeddings = embeddings + self.get_position_embedding(embeddings, height, width, interpolate_pos_encoding) + return embeddings, bool_masked_pos, ids_restore + + +class HieraMaskUnitAttention(nn.Module): + """ + Computes either Mask Unit or Global Attention. Also is able to perform query pooling. + + Note: this assumes the tokens have already been flattened and unrolled into mask units. + """ + + def __init__( + self, + hidden_size: int, + hidden_size_output: int, + num_heads: int, + query_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ) -> None: + super().__init__() + self.num_heads = num_heads + self.query_stride = query_stride + self.hidden_size_output = hidden_size_output + + self.head_dim = hidden_size_output // num_heads + self.scale = (self.head_dim) ** -0.5 + + self.qkv = nn.Linear(hidden_size, 3 * hidden_size_output) + self.proj = nn.Linear(hidden_size_output, hidden_size_output) + + self.window_size = window_size + self.use_mask_unit_attn = use_mask_unit_attn + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input should be of shape [batch, tokens, channels].""" + batch_size, seq_len, _ = hidden_states.shape + + num_windows = 1 + if self.use_mask_unit_attn: + num_windows = seq_len // (self.query_stride * self.window_size) + + qkv = self.qkv(hidden_states) + qkv = qkv.reshape(batch_size, -1, num_windows, 3, self.num_heads, self.head_dim) + qkv = qkv.permute(3, 0, 4, 2, 1, 5) + + query, key, value = qkv.unbind(0) + + if self.query_stride > 1: + # Refer to unroll to see how this performs a maxpool-Nd + query = query.view(batch_size, self.num_heads, num_windows, self.query_stride, -1, self.head_dim) + query = query.max(dim=3).values + + attn_weights = (query * self.scale) @ key.transpose(-1, -2) + attn_weights = attn_weights.softmax(dim=-1) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = attn_weights @ value + attn_output = attn_output.transpose(1, 3).reshape(batch_size, -1, self.hidden_size_output) + attn_output = self.proj(attn_output) + + return (attn_output, attn_weights) if output_attentions else (attn_output, None) + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Hiera +class HieraDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class HieraMlp(nn.Module): + def __init__(self, config, dim: int) -> None: + super().__init__() + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(dim, int(dim * config.mlp_ratio)) + self.fc2 = nn.Linear(int(dim * config.mlp_ratio), dim) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class HieraLayer(nn.Module): + def __init__( + self, + config, + hidden_size: int, + hidden_size_output: int, + num_heads: int, + drop_path: float = 0.0, + query_stride: int = 1, + window_size: int = 0, + use_mask_unit_attn: bool = False, + ) -> None: + super().__init__() + + self.hidden_size = hidden_size + self.hidden_size_output = hidden_size_output + self.query_stride = query_stride + + self.layernorm_before = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps) + self.attn = HieraMaskUnitAttention( + hidden_size=hidden_size, + hidden_size_output=hidden_size_output, + num_heads=num_heads, + query_stride=query_stride, + window_size=window_size, + use_mask_unit_attn=use_mask_unit_attn, + ) + + self.layernorm_after = nn.LayerNorm(hidden_size_output, eps=config.layer_norm_eps) + self.mlp = HieraMlp(config, hidden_size_output) + + self.drop_path = HieraDropPath(drop_path) if drop_path > 0 else nn.Identity() + if hidden_size != hidden_size_output: + self.proj = nn.Linear(hidden_size, hidden_size_output) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + batch_size, seq_len, _ = hidden_states.shape + # Attention + Q Pooling + hidden_states_norm = self.layernorm_before(hidden_states) + if self.hidden_size != self.hidden_size_output: + hidden_states = self.proj(hidden_states_norm) + # Refer to unroll to see how this performs a maxpool-Nd + hidden_states = ( + hidden_states.view(batch_size, self.query_stride, -1, self.hidden_size_output).max(dim=1).values + ) + + (hidden_states_norm, attn_weights) = self.attn( + hidden_states_norm, head_mask, output_attentions=output_attentions + ) + hidden_states = hidden_states + self.drop_path(hidden_states_norm) + + residual = hidden_states + hidden_states = self.layernorm_after(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.drop_path(hidden_states) + + return (hidden_states, attn_weights) + + +class HieraStage(GradientCheckpointingLayer): + def __init__( + self, + config, + depth: int, + hidden_size: int, + hidden_size_output: int, + num_heads: int, + drop_path: list[float], + query_stride: list[int], + window_size: int, + use_mask_unit_attn: bool, + stage_num: Optional[int] = None, + ) -> None: + super().__init__() + # we need to know if the previous stage used masked attention + # mask unit or global attention. + # lag by 1 layer, so that global attention, + # applied post pooling on lower resolution + previous_stage_used_masked_attention = False + if stage_num is not None: + previous_stage_used_masked_attention = config.masked_unit_attention[stage_num - 1 if stage_num > 0 else 0] + self.layers = nn.ModuleList( + [ + HieraLayer( + config=config, + hidden_size=hidden_size if i == 0 else hidden_size_output, + hidden_size_output=hidden_size_output, + num_heads=num_heads, + drop_path=drop_path[i], + query_stride=query_stride[i], + window_size=window_size, + use_mask_unit_attn=use_mask_unit_attn or (previous_stage_used_masked_attention and i == 0), + ) + for i in range(depth) + ] + ) + + def forward( + self, hidden_states: torch.Tensor, head_mask: Optional[torch.FloatTensor], output_attentions: bool = False + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + for i, layer_module in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + (hidden_states, attn_weights) = layer_module( + hidden_states, layer_head_mask, output_attentions=output_attentions + ) + + return hidden_states, attn_weights + + +def undo_windowing(hidden_states: torch.Tensor, shape: list[int], mask_unit_shape: list[int]) -> torch.Tensor: + """ + Restore spatial organization by undoing windowed organization of mask units. + + Args: + hidden_states (`torch.Tensor`): The hidden states tensor of shape `[batch_size, num_mask_unit_height*num_mask_unit_width, hidden_size]`. + shape (`list[int]`): The original shape of the hidden states tensor before windowing. + mask_unit_shape (`list[int]`): The shape of the mask units used for windowing. + + Returns: + torch.Tensor: The restored hidden states tensor of shape [batch_size, num_mask_unit_height*mask_unit_height, num_mask_unit_width*mask_unit_width, hidden_size]. + """ + batch_size, hidden_size = hidden_states.shape[0], hidden_states.shape[-1] + # From: [batch_size, num_mask_unit_height*num_mask_unit_width, hidden_size] + # To: [batch_size, num_mask_unit_height, num_mask_unit_width, mask_unit_height, mask_unit_width, hidden_size] + num_mask_units = [s // mu for s, mu in zip(shape, mask_unit_shape)] + hidden_states = hidden_states.view(batch_size, *num_mask_units, *mask_unit_shape, hidden_size) + + # From: [batch_size, num_mask_unit_height, num_mask_unit_width, mask_unit_height, mask_unit_width, hidden_size] + # To: [batch_size, num_mask_unit_height*mask_unit_height, num_mask_unit_width*mask_unit_width, hidden_size] + hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5) + hidden_states = hidden_states.reshape(batch_size, *shape, hidden_size) + + return hidden_states + + +class HieraEncoder(nn.Module): + def __init__(self, config: HieraConfig) -> None: + super().__init__() + total_depth = sum(config.depths) + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, total_depth, device="cpu")] + # query strides rule + cumulative_depths = torch.tensor(config.depths, device="cpu").cumsum(0).tolist() + query_pool_layer = cumulative_depths[: config.num_query_pool] + query_strides = [math.prod(config.query_stride) if i in query_pool_layer else 1 for i in range(total_depth)] + + # Transformer blocks + self.stages = nn.ModuleList() + hidden_size = config.embed_dim + stage_ends = [0] + cumulative_depths + masked_unit_area = math.prod(config.masked_unit_size) + query_stride_area = math.prod(config.query_stride) + for idx_stage, depth in enumerate(config.depths): + hidden_size_output = int(config.embed_dim * config.embed_dim_multiplier**idx_stage) + + stage = HieraStage( + config=config, + depth=depth, + hidden_size=hidden_size, + hidden_size_output=hidden_size_output, + num_heads=config.num_heads[idx_stage], + drop_path=dpr[stage_ends[idx_stage] : stage_ends[idx_stage + 1]], + query_stride=query_strides[stage_ends[idx_stage] : stage_ends[idx_stage + 1]], + window_size=int(masked_unit_area * query_stride_area**-idx_stage), + use_mask_unit_attn=config.masked_unit_attention[idx_stage], + stage_num=idx_stage, + ) + + hidden_size = hidden_size_output + self.stages.append(stage) + + # Setting reroll schedule + # The first stage has to reverse everything + # The next stage has to reverse all but the first unroll, etc. + stage_size = [i // s for i, s in zip(config.image_size, config.patch_stride)] + unroll_schedule = [config.query_stride] * len(config.depths[:-1]) + + self.schedule = {} + for idx_stage in range(len(config.depths)): + self.schedule[idx_stage] = unroll_schedule, stage_size + if idx_stage < config.num_query_pool: + stage_size = [i // s for i, s in zip(stage_size, config.query_stride)] + unroll_schedule = unroll_schedule[1:] + + self.gradient_checkpointing = False + + def reroll( + self, hidden_states: torch.Tensor, stage_idx: int, bool_masked_pos: Optional[torch.BoolTensor] = None + ) -> torch.Tensor: + """ + Roll the given tensor back up to spatial order assuming it's from the given block. + + If no bool_masked_pos is provided returns: + - [batch_size, height, width, hidden_size] + If a bool_masked_pos is provided returns: + - [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size] + """ + schedule, size = self.schedule[stage_idx] + batch_size, seq_len, hidden_size = hidden_states.shape + + num_dim = len(size) + mask_unit_shape = [1] * num_dim + + for strides in schedule: + # Extract the current patch from seq_len + hidden_states = hidden_states.view( + batch_size, *strides, seq_len // math.prod(strides), *mask_unit_shape, hidden_size + ) + + # Move that patch into the current MU + # Input: [batch_size, stride, stride, seq_len//(stride*stride), mask_unit_height, mask_unit_width, hidden_size] + # Output: [batch_size, seq_len//(stride*stride), stride, mask_unit_height, stride, mask_unit_width, hidden_size] + hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5, 6) + + # Reshape to [batch_size, seq_len//(stride*stride), *mask_units, hidden_size] + for i in range(num_dim): + mask_unit_shape[i] *= strides[i] + hidden_states = hidden_states.reshape(batch_size, -1, *mask_unit_shape, hidden_size) + seq_len = hidden_states.shape[1] + + # Current shape (e.g., 2d: [batch_size, #num_mask_units_height*#num_mask_units_width, mask_unit_height, mask_unit_width, hidden_size]) + hidden_states = hidden_states.view(batch_size, seq_len, *mask_unit_shape, hidden_size) + + # If masked, return [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size] + if bool_masked_pos is not None: + return hidden_states + + # If not masked, we can return [batch_size, height, width, hidden_size] + hidden_states = undo_windowing(hidden_states, size, mask_unit_shape) + + return hidden_states + + def forward( + self, + hidden_states: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + reshaped_hidden_states = self.reroll(hidden_states, stage_idx=0, bool_masked_pos=bool_masked_pos) + all_reshaped_hidden_states = all_reshaped_hidden_states + (reshaped_hidden_states,) + + for i, stage_module in enumerate(self.stages): + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = stage_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + reshaped_hidden_states = self.reroll(hidden_states, stage_idx=i, bool_masked_pos=bool_masked_pos) + all_reshaped_hidden_states = all_reshaped_hidden_states + (reshaped_hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, all_hidden_states, all_self_attentions, all_reshaped_hidden_states] + if v is not None + ) + return HieraEncoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + reshaped_hidden_states=all_reshaped_hidden_states, + ) + + +def unroll( + hidden_states: torch.Tensor, image_shape: tuple[int, int], patch_stride: tuple[int, int], schedule: list[list[int]] +) -> torch.Tensor: + """ + Reorders the tokens such that patches are contiguous in memory. + E.g., given [batch_size, (height, width), hidden_size] and stride of (stride, stride), this will re-order the tokens as + [batch_size, (stride, stride, height // stride, width // stride), hidden_size] + + This allows operations like Max2d to be computed as x.view(batch_size, stride*stride, -1, hidden_size).max(dim=1). + Not only is this faster, but it also makes it easy to support inputs of arbitrary + dimensions in addition to patch-wise sparsity. + + Performing this operation multiple times in sequence puts entire windows as contiguous + in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of + size 8x8 would be contiguous in memory, allowing operations like mask unit attention + computed easily and efficiently, while also allowing max to be applied sequentially. + + Note: This means that intermediate values of the model are not in height x width order, so they + need to be re-rolled if you want to use the intermediate values as a height x width feature map. + The last block of the network is fine though, since by then the strides are all consumed. + """ + batch_size, _, hidden_size = hidden_states.shape + + size = [i // s for i, s in zip(image_shape, patch_stride)] + + current_size = size + hidden_states = hidden_states.view(*([batch_size] + current_size + [hidden_size])) + + for strides in schedule: + # Move patches with the given strides to the batch dimension + + # Create a view of the tensor with the patch stride as separate dims + # For example in 2d: [batch_size, height // stride, stride, width // stride, stride, C] + current_size = [i // s for i, s in zip(current_size, strides)] + # initialize new_shape with [height // stride, stride, width // stride, stride] + new_shape = [item for pair in zip(current_size, strides) for item in pair] + # add batch_size and hidden_size to new_shape + new_shape = [batch_size] + new_shape + [hidden_size] + hidden_states = hidden_states.view(new_shape) + + # Move the patch stride into the batch dimension + # For example in 2d: [batch_size, stride, stride, height // stride, width // stride, hidden_size] + num_dims = len(new_shape) + permute = [0] + list(range(2, num_dims - 1, 2)) + list(range(1, num_dims - 1, 2)) + [num_dims - 1] + hidden_states = hidden_states.permute(permute) + + # Now finally flatten the relevant dims into the batch dimension + hidden_states = hidden_states.flatten(0, len(strides)) + batch_size *= math.prod(strides) + + hidden_states = hidden_states.reshape(-1, math.prod(size), hidden_size) + return hidden_states + + +@auto_docstring +class HieraPreTrainedModel(PreTrainedModel): + config: HieraConfig + base_model_prefix = "hiera" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module) -> None: + """Initialize the weights""" + std = self.config.initializer_range + + if isinstance(module, HieraEmbeddings): + nn.init.trunc_normal_(module.position_embeddings, std=std) + + elif isinstance(module, HieraDecoder): + nn.init.trunc_normal_(module.mask_token, std=std) + nn.init.trunc_normal_(module.decoder_position_embeddings, std=std) + + elif isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d)): + nn.init.trunc_normal_(module.weight, std=std) + if module.bias is not None: + nn.init.constant_(module.bias, std) + + elif isinstance(module, nn.LayerNorm): + nn.init.constant_(module.bias, std) + nn.init.constant_(module.weight, self.config.layer_norm_init) + + +class HieraPooler(nn.Module): + def __init__(self, config: HieraConfig): + super().__init__() + num_features = int(config.embed_dim * config.embed_dim_multiplier ** (len(config.depths) - 1)) + self.layernorm = nn.LayerNorm(num_features, eps=config.layer_norm_eps) + self.pooler = nn.AdaptiveAvgPool1d(1) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = hidden_states.transpose(1, 2) + pooled_output = self.pooler(hidden_states) + pooled_output = torch.flatten(pooled_output, 1) + pooled_output = self.layernorm(pooled_output) + return pooled_output + + +@auto_docstring +class HieraModel(HieraPreTrainedModel): + def __init__(self, config: HieraConfig, add_pooling_layer: bool = True, is_mae: bool = False): + r""" + add_pooling_layer (`bool`, *optional*, defaults to `True`): + Whether or not to apply pooling layer. + is_mae (`bool`, *optional*, defaults to `False`): + Whether or not to run the model on MAE mode. + """ + super().__init__(config) + self.num_features = int(config.embed_dim * config.embed_dim_multiplier ** (len(config.depths) - 1)) + + self.embeddings = HieraEmbeddings(config, is_mae=is_mae) + self.encoder = HieraEncoder(config) + + self.unroll_schedule = [config.query_stride] * len(config.depths[:-1]) + + self.pooler = HieraPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> HieraPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + noise: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + r""" + noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*): + Mainly used for testing purposes to control randomness and maintain the reproducibility + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, len(self.config.depths)) + + embedding_output, bool_masked_pos, ids_restore = self.embeddings( + pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, noise=noise + ) + + image_shape = (pixel_values.shape[-2], pixel_values.shape[-1]) + hidden_states = unroll( + embedding_output, + image_shape=image_shape, + patch_stride=self.config.patch_stride, + schedule=self.unroll_schedule, + ) + + # Discard masked tokens if bool_masked_pos is provided + if bool_masked_pos is not None: + mask_unit_area = math.prod(self.config.masked_unit_size) + batch_size, _, hidden_size = hidden_states.shape + positions = bool_masked_pos.unsqueeze(-1).tile(1, mask_unit_area, hidden_size) + hidden_states = hidden_states[positions] + hidden_states = hidden_states.view(batch_size, -1, hidden_size) + + encoder_outputs = self.encoder( + hidden_states, + bool_masked_pos=bool_masked_pos, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = None + if self.pooler is not None: + pooled_output = self.pooler(sequence_output) + + if not return_dict: + head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) + head_outputs = ( + head_outputs + (bool_masked_pos, ids_restore) if bool_masked_pos is not None else head_outputs + ) + return head_outputs + encoder_outputs[1:] + + return HieraModelOutput( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + bool_masked_pos=bool_masked_pos, + ids_restore=ids_restore, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + reshaped_hidden_states=encoder_outputs.reshaped_hidden_states, + ) + + +class HieraDecoder(nn.Module): + def __init__(self, config: HieraConfig): + super().__init__() + num_features = int(config.embed_dim * config.embed_dim_multiplier ** (len(config.depths) - 1)) + tokens_spatial_shape = [i // s for i, s in zip(config.image_size, config.patch_stride)] + self.tokens_spatial_shape_final = [ + i // s ** (config.num_query_pool) for i, s in zip(tokens_spatial_shape, config.query_stride) + ] + self.mask_unit_spatial_shape_final = [ + i // s ** (config.num_query_pool) for i, s in zip(config.masked_unit_size, config.query_stride) + ] + + self.decoder_embeddings = nn.Linear(num_features, config.decoder_hidden_size) + + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size)) + + self.decoder_position_embeddings = nn.Parameter( + torch.zeros(1, math.prod(self.tokens_spatial_shape_final), config.decoder_hidden_size) + ) + + self.decoder_block = HieraStage( + config=config, + hidden_size=config.decoder_hidden_size, + hidden_size_output=config.decoder_hidden_size, + num_heads=config.decoder_num_heads, + depth=config.decoder_depth, + use_mask_unit_attn=False, + drop_path=[0.0] * config.decoder_depth, + query_stride=[1] * config.decoder_depth, + window_size=0, + ) + + self.decoder_norm = nn.LayerNorm(config.decoder_hidden_size, eps=config.layer_norm_eps) + + # patch stride of prediction + self.pred_stride = config.patch_stride[-1] * (config.query_stride[-1] ** config.num_query_pool) + pred_dim = (self.pred_stride ** len(config.query_stride)) * config.num_channels + + self.decoder_pred = nn.Linear(config.decoder_hidden_size, pred_dim) + + def forward( + self, + encoder_hidden_states: torch.Tensor, + bool_masked_pos: torch.BoolTensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, torch.BoolTensor]: + # Embed tokens + hidden_states = self.decoder_embeddings(encoder_hidden_states) + + # Combine visible and bool_masked_pos tokens + + # hidden_states : [batch_size, num_mask_units_visible, *mask_unit_spatial_shape_final, decoder_hidden_size] + # bool_masked_pos: [batch_size, num_mask_units] + mask_unit_height, mask_unit_width, decoder_hidden_size = hidden_states.shape[2:] + batch_size, num_mask_units = bool_masked_pos.shape + + decoder_hidden_states = torch.zeros( + batch_size, + num_mask_units, + mask_unit_height, + mask_unit_width, + decoder_hidden_size, + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + mask_tokens = self.mask_token.view(1, 1, 1, 1, -1) + bool_masked_pos = bool_masked_pos.reshape(batch_size, num_mask_units, 1, 1, 1) + bool_masked_pos = bool_masked_pos.expand(-1, -1, mask_unit_height, mask_unit_width, decoder_hidden_size) + decoder_hidden_states[bool_masked_pos] = hidden_states.flatten() + decoder_hidden_states = ( + 1 - bool_masked_pos.float() + ) * mask_tokens + bool_masked_pos.float() * decoder_hidden_states + + # Get back spatial order + hidden_states = undo_windowing( + decoder_hidden_states, + self.tokens_spatial_shape_final, + self.mask_unit_spatial_shape_final, + ) + bool_masked_pos = undo_windowing( + bool_masked_pos[..., 0:1], + self.tokens_spatial_shape_final, + self.mask_unit_spatial_shape_final, + ) + + # Flatten + hidden_states = hidden_states.reshape(hidden_states.shape[0], -1, hidden_states.shape[-1]) + bool_masked_pos = bool_masked_pos.view(hidden_states.shape[0], -1) + + # Add pos embed + hidden_states = hidden_states + self.decoder_position_embeddings + + # Apply decoder blocks + hidden_states, attn_weights = self.decoder_block( + hidden_states, head_mask=head_mask, output_attentions=output_attentions + ) + hidden_states = self.decoder_norm(hidden_states) + + # Predictor projection + hidden_states = self.decoder_pred(hidden_states) + + return hidden_states, bool_masked_pos + + +class HieraMultiScaleHead(nn.Module): + def __init__(self, config: HieraConfig): + super().__init__() + self.mask_unit_spatial_shape_final = [ + i // s ** (config.num_query_pool) for i, s in zip(config.masked_unit_size, config.query_stride) + ] + self.stage_dimensions = [ + int(config.embed_dim * config.embed_dim_multiplier**i) for i in range(len(config.depths)) + ] + current_masked_unit_size = config.masked_unit_size + self.multi_scale_fusion_heads = nn.ModuleList() + + for idx in range(config.num_query_pool): + kernel = [i // s for i, s in zip(current_masked_unit_size, self.mask_unit_spatial_shape_final)] + current_masked_unit_size = [i // s for i, s in zip(current_masked_unit_size, config.query_stride)] + self.multi_scale_fusion_heads.append( + nn.Conv2d( + self.stage_dimensions[idx], + self.stage_dimensions[-1], + kernel_size=kernel, + stride=kernel, + ) + ) + self.multi_scale_fusion_heads.append(nn.Identity()) + + def apply_fusion_head(self, head: nn.Module, hidden_states: torch.Tensor) -> torch.Tensor: + if isinstance(head, nn.Identity): + return hidden_states + + # Doing explicit to avoid problems with torch.fx + batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size = hidden_states.shape + # From: [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size] + # To: head([batch_size * num_mask_units, hidden_size, mask_unit_height, mask_unit_width]) + hidden_states = hidden_states.reshape( + batch_size * num_mask_units, mask_unit_height, mask_unit_width, hidden_size + ) + hidden_states = hidden_states.permute(0, 3, 1, 2) + hidden_states = head(hidden_states) + + # Restore original layout + hidden_states = hidden_states.permute(0, 2, 3, 1) + mask_unit_height_final, mask_unit_width_final, hidden_size = hidden_states.shape[1:] + hidden_states = hidden_states.reshape( + batch_size, num_mask_units, mask_unit_height_final, mask_unit_width_final, hidden_size + ) + + return hidden_states + + def forward(self, feature_maps: list[torch.Tensor]) -> torch.Tensor: + # Multi-scale fusion + hidden_states = 0.0 + for head, feature_map in zip(self.multi_scale_fusion_heads, feature_maps): + hidden_states = hidden_states + self.apply_fusion_head(head, feature_map) + + return hidden_states + + +@auto_docstring( + custom_intro=""" + The Hiera Model transformer with the decoder on top for self-supervised pre-training. + + + + Note that we provide a script to pre-train this model on custom data in our [examples + directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + + + """ +) +class HieraForPreTraining(HieraPreTrainedModel): + def __init__(self, config: HieraConfig) -> None: + super().__init__(config) + # Encoder + self.hiera = HieraModel(config, add_pooling_layer=False, is_mae=True) + self.encoder_norm = nn.LayerNorm(self.hiera.num_features, eps=config.layer_norm_eps) + # Multi-scale fusion heads + self.multiscale_fusion = HieraMultiScaleHead(config) + # Decoder + self.decoder = HieraDecoder(config) + self.pred_stride = self.decoder.pred_stride + + # Initialize weights and apply final processing + self.post_init() + + def get_pixel_label_2d(self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor) -> torch.Tensor: + # bool_masked_pos (boolean tensor): True means *masked* + pixel_values = pixel_values.permute(0, 2, 3, 1) + + size = self.pred_stride + label = pixel_values.unfold(1, size, size).unfold(2, size, size) + label = label.flatten(1, 2).flatten(2) + label = label[bool_masked_pos] + if self.config.normalize_pixel_loss: + mean = label.mean(dim=-1, keepdim=True) + var = label.var(dim=-1, keepdim=True) + label = (label - mean) / (var + 1.0e-6) ** 0.5 + + return label + + def forward_loss(self, pixel_values: torch.Tensor, logits: torch.Tensor, bool_masked_pos: torch.BoolTensor): + # We invert the bool_masked_pos such that 1.0 is *masked* + bool_masked_pos = ~bool_masked_pos + label = self.get_pixel_label_2d(pixel_values, bool_masked_pos) + + logits = logits[bool_masked_pos] + loss = (logits - label) ** 2 + loss = loss.mean() + + return loss + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + noise: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, HieraForPreTrainingOutput]: + r""" + noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*): + Mainly used for testing purposes to control randomness and maintain the reproducibility + + Examples: + ```python + >>> from transformers import AutoImageProcessor, HieraForPreTraining + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/hiera-tiny-224-mae-hf") + >>> model = HieraForPreTraining.from_pretrained("facebook/hiera-tiny-224-mae-hf") + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> logits = outputs.logits + >>> loss = outputs.loss + >>> print(list(logits.shape)) + [1, 196, 768] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.hiera( + pixel_values, + noise=noise, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=True, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + feature_maps = outputs[-1] + bool_masked_pos = outputs[1] + ids_to_restore = outputs[2] + # Take only the query pooled and last hidden states + feature_maps = feature_maps[1 : self.hiera.config.num_query_pool + 1] + (feature_maps[-1],) + fused_hidden_states = self.multiscale_fusion(feature_maps) + fused_hidden_states = self.encoder_norm(fused_hidden_states) + + # Reconstruct pixel values + logits, bool_masked_pos = self.decoder( + fused_hidden_states, + bool_masked_pos=bool_masked_pos, + head_mask=head_mask, + output_attentions=output_attentions, + ) + + loss = self.forward_loss(pixel_values, logits, bool_masked_pos) + + if not return_dict: + output = (logits, bool_masked_pos, ids_to_restore) + if output_hidden_states: + output = output + (outputs[3],) + if output_attentions: + output = output + (outputs[4],) + if output_hidden_states: + output = output + (outputs[-1],) + return ((loss,) + output) if loss is not None else output + + return HieraForPreTrainingOutput( + loss=loss, + logits=logits, + bool_masked_pos=bool_masked_pos, + ids_restore=ids_to_restore, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + reshaped_hidden_states=outputs.reshaped_hidden_states if output_hidden_states else None, + ) + + +@auto_docstring( + custom_intro=""" + Hiera Model transformer with an image classification head on top (a linear layer on top of the final hidden state with + average pooling) e.g. for ImageNet. + + + + Note that it's possible to fine-tune Hiera on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. + + + """ +) +class HieraForImageClassification(HieraPreTrainedModel): + def __init__(self, config: HieraConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.hiera = HieraModel(config, add_pooling_layer=True, is_mae=False) + + # Classifier head + self.classifier = ( + nn.Linear(self.hiera.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, HieraForImageClassificationOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.hiera( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return HieraForImageClassificationOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + reshaped_hidden_states=outputs.reshaped_hidden_states, + ) + + +@auto_docstring( + custom_intro=""" + Hiera backbone, to be used with frameworks like DETR and MaskFormer. + """ +) +class HieraBackbone(HieraPreTrainedModel, BackboneMixin): + def __init__(self, config: HieraConfig): + super().__init__(config) + super()._init_backbone(config) + + self.num_features = [config.embed_dim] + [ + int(config.embed_dim * config.embed_dim_multiplier**i) for i in range(len(config.depths)) + ] + self.embeddings = HieraEmbeddings(config, is_mae=False) + self.encoder = HieraEncoder(config) + + # Add layer norms to hidden states of out_features + hidden_states_norms = {} + for stage, num_channels in zip(self._out_features, self.channels): + hidden_states_norms[stage] = nn.LayerNorm(num_channels) + self.hidden_states_norms = nn.ModuleDict(hidden_states_norms) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/hiera-tiny-224-hf") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/hiera-tiny-224-hf", out_features=["stage1", "stage2", "stage3", "stage4"] + ... ) + + >>> inputs = processor(image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 7, 7] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output, _, _ = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, + head_mask=None, + output_attentions=output_attentions, + output_hidden_states=True, + return_dict=return_dict, + ) + + hidden_states = outputs[-1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + batch_size, height, width, num_channels = hidden_state.shape + hidden_state = hidden_state.view(batch_size, height * width, num_channels) + hidden_state = self.hidden_states_norms[stage](hidden_state) + hidden_state = hidden_state.view(batch_size, height, width, num_channels) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + feature_maps += (hidden_state,) + + if not return_dict: + output = (feature_maps,) + if output_hidden_states: + output += (outputs[1],) + if output_attentions: + output += (outputs[2],) + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs[1] if output_hidden_states else None, + attentions=outputs[2] if output_attentions else None, + ) + + +__all__ = ["HieraForImageClassification", "HieraForPreTraining", "HieraBackbone", "HieraModel", "HieraPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fcb18edb8e1d5475ea5c043da2da32aae492cc17 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/configuration_hubert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/configuration_hubert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ab9ce8301fb802060f302b4f299e31a9e9aca24 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/configuration_hubert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modeling_hubert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modeling_hubert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f93b20e574add0aa57b5653088a798e79c361eb8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modeling_hubert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modeling_tf_hubert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modeling_tf_hubert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3573ce73b103545e61b4cd0e620aeb11245b96e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modeling_tf_hubert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modular_hubert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modular_hubert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee8613086ddb94a330a963bc4b8abfac8ddd99db Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modular_hubert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..735df04f3d1cb535c27dd77e995939ec10e0f72e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/configuration_hunyuan_v1_dense.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/configuration_hunyuan_v1_dense.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..456198490d324ea8964879648e2d51e53047ee68 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/configuration_hunyuan_v1_dense.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modeling_hunyuan_v1_dense.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modeling_hunyuan_v1_dense.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e7d75b63b704f42cfc0ae5b7ea2597d7cc37c93 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modeling_hunyuan_v1_dense.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modular_hunyuan_v1_dense.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modular_hunyuan_v1_dense.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5dceea97da189f2b0d6a88c5fbb9a44af39dff10 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modular_hunyuan_v1_dense.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cd107ee7a3c16d2527806035f67702e9220cae51 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/__init__.py @@ -0,0 +1,14 @@ +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_hunyuan_v1_moe import * + from .modeling_hunyuan import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..386ddac1d3ebb37de330a3940ad5ac556be5bcf6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -0,0 +1,204 @@ +# coding=utf-8 +# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""HunYuanMoEV1 model configuration""" + +from typing import Union + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +class HunYuanMoEV1Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`HunYuanMoEV1Model`]. It is used to instantiate an + HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the HunYuan-7B. + Hunyuan-A13B-Instruct [tencent/Hunyuan-A13B-Instruct](https://huggingface.co/tencent/Hunyuan-A13B-Instruct). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 290943): + Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`HunYuanMoEV1Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations or shared MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + eod_token_id (int, *optional*, defaults to 3): + Token ID representing the end-of-document marker. Used to indicate the termination of a text sequence. + Example: In multi-document processing, this token helps the model distinguish between separate documents. + sep_token_id (`int`, *optional*, defaults to 4): + Token ID representing the separator token (`[SEP]`), used to demarcate boundaries between different text segments. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is + necessary to ensure exact reproducibility of the pretraining results. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + num_experts (`int` or `List`, *optional*, defaults to 1): + The number of experts for moe. If it is a list, it will be used as the number of experts for each layer. + moe_topk (int or List, *optional*, defaults to 1): + Number of experts selected per token (Top-K routing). List form enables layer-wise customization. + head_dim (`int`, *optional*, defaults to 128): + The attention head dimension. + """ + + model_type = "hunyuan_v1_moe" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=290943, + hidden_size=4096, + intermediate_size: int = 11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + eod_token_id=3, + sep_token_id=4, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + num_experts: Union[int, list] = 1, + moe_topk: Union[int, list] = 1, + head_dim=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_experts = num_experts + self.moe_topk = moe_topk + + self.head_dim = head_dim + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + # self._rope_scaling_validation() # TODO: Need validation? + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + sep_token_id=sep_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor` or `type` and `alpha`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + rope_scaling_alpha = self.rope_scaling.get("alpha", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None and rope_scaling_alpha is None: + raise ValueError("`rope_scaling`'s factor or alpha field must be have one, got both of none") + if rope_scaling_factor is not None: + if not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1.0, got {rope_scaling_factor}") + if rope_scaling_alpha is not None: + if not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0: + raise ValueError(f"`rope_scaling`'s alpha field must be a float > 1.0, got {rope_scaling_alpha}") + + +__all__ = ["HunYuanMoEV1Config"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..043d1f8243a35638747f3ff44946748dc472fcf1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py @@ -0,0 +1,584 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_hunyuan_v1_moe.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from transformers.cache_utils import Cache + +from ...activations import ACT2FN +from ...cache_utils import DynamicCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask +from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import check_model_inputs +from .configuration_hunyuan_v1_moe import HunYuanMoEV1Config + + +@use_kernel_forward_from_hub("RMSNorm") +class HunYuanMoEV1RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + HunYuanMoEV1RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class HunYuanMoEV1MLP(nn.Module): + def __init__(self, config: HunYuanMoEV1Config, layer_idx=None, is_shared_mlp=False): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + self.layer_idx = layer_idx + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class HunYuanMoEV1Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: HunYuanMoEV1Config, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.query_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.key_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + query_states = self.query_layernorm(query_states) + key_states = self.key_layernorm(key_states) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class HunYuanMoEV1Gate(nn.Module): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx] + self.wg = nn.Linear(config.hidden_size, num_experts, bias=False, dtype=torch.float32) + + def forward(self, hidden_states): + bsz, seq_len, hidden_size = hidden_states.shape + hidden_states = hidden_states.reshape(-1, hidden_size) + if self.wg.weight.dtype == torch.float32: + hidden_states = hidden_states.float() + logits = self.wg(hidden_states) + return logits + + +class HunYuanMoEV1Moe(nn.Module): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx] + self.top_k = config.moe_topk if isinstance(config.moe_topk, int) else config.moe_topk[layer_idx] + self.gate = HunYuanMoEV1Gate(config, layer_idx=layer_idx) + # self.wg = nn.Linear(config.hidden_size, config.num_experts, bias=False, dtype=torch.float32) + self.experts = nn.ModuleList( + [HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=False) for _ in range(self.num_experts)] + ) + + self.shared_mlp = HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=True) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states_mlp = self.shared_mlp(hidden_states) + router_logits = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + # we cast back to the input dtype + routing_weights = routing_weights.to(hidden_states.dtype) + + final_hidden_states = torch.zeros( + (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated + expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) + + # Loop over all available experts in the model and perform the computation on each expert + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + for expert_idx in expert_hit: + expert_layer = self.experts[expert_idx] + idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0)) + + # Index the correct hidden states and compute the expert hidden state for + # the current expert. We need to make sure to multiply the output hidden + # states by `routing_weights` on the corresponding tokens (top-1 and top-2) + current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] + + # However `index_add_` only support torch tensors for indexing so we'll use + # the `top_x` tensor here. + final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype)) + final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim) + return final_hidden_states + hidden_states_mlp + + +class HunYuanMoEV1DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = HunYuanMoEV1Attention(config=config, layer_idx=layer_idx) + self.mlp = HunYuanMoEV1Moe(config, layer_idx=layer_idx) + self.input_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.layer_idx = layer_idx + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +@auto_docstring +class HunYuanMoEV1PreTrainedModel(PreTrainedModel): + config: HunYuanMoEV1Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["HunYuanMoEV1DecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + _can_compile_fullgraph = False + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": HunYuanMoEV1DecoderLayer, + "attentions": HunYuanMoEV1Attention, + } + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +class HunYuanMoEV1RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: HunYuanMoEV1Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + if self.rope_type == "dynamic" and config.rope_scaling["alpha"]: + # DynamicNTKAlphaRotary + self.dim = config.head_dim + base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.attention_scaling = 1.0 + else: + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +@auto_docstring +class HunYuanMoEV1Model(HunYuanMoEV1PreTrainedModel): + def __init__(self, config: HunYuanMoEV1Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [HunYuanMoEV1DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = HunYuanMoEV1RotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +@auto_docstring +class HunYuanMoEV1ForCausalLM(HunYuanMoEV1PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = HunYuanMoEV1Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, HunYuanMoEV1ForCausalLM + + >>> model = HunYuanMoEV1ForCausalLM.from_pretrained("meta-hunyuan_v1_moe/HunYuanMoEV1-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-hunyuan_v1_moe/HunYuanMoEV1-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class HunYuanMoEV1ForSequenceClassification(GenericForSequenceClassification, HunYuanMoEV1PreTrainedModel): + pass + + +__all__ = [ + "HunYuanMoEV1ForCausalLM", + "HunYuanMoEV1Model", + "HunYuanMoEV1PreTrainedModel", + "HunYuanMoEV1ForSequenceClassification", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..a72d6268fe7021b472a141beee79ea3eadca5c84 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py @@ -0,0 +1,273 @@ +# coding=utf-8 +# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch HunYuanMoEV1 model.""" + +from typing import Callable, Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from transformers.cache_utils import Cache +from transformers.utils import ( + logging, +) + +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import TransformersKwargs +from ..llama.modeling_llama import ( + LlamaAttention, + LlamaDecoderLayer, + LlamaForCausalLM, + LlamaForSequenceClassification, + LlamaMLP, + LlamaModel, + LlamaPreTrainedModel, + LlamaRMSNorm, + apply_rotary_pos_emb, + eager_attention_forward, +) +from .configuration_hunyuan_v1_moe import HunYuanMoEV1Config + + +logger = logging.get_logger(__name__) + + +class HunYuanMoEV1RMSNorm(LlamaRMSNorm): + pass + + +class HunYuanMoEV1MLP(LlamaMLP): + def __init__(self, config: HunYuanMoEV1Config, layer_idx=None, is_shared_mlp=False): + super().__init__(config) + self.layer_idx = layer_idx + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + +class HunYuanMoEV1Attention(LlamaAttention): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: int): + super().__init__(config, layer_idx) + self.query_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.key_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + query_states = self.query_layernorm(query_states) + key_states = self.key_layernorm(key_states) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class HunYuanMoEV1Gate(nn.Module): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx] + self.wg = nn.Linear(config.hidden_size, num_experts, bias=False, dtype=torch.float32) + + def forward(self, hidden_states): + bsz, seq_len, hidden_size = hidden_states.shape + hidden_states = hidden_states.reshape(-1, hidden_size) + if self.wg.weight.dtype == torch.float32: + hidden_states = hidden_states.float() + logits = self.wg(hidden_states) + return logits + + +class HunYuanMoEV1Moe(nn.Module): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx] + self.top_k = config.moe_topk if isinstance(config.moe_topk, int) else config.moe_topk[layer_idx] + self.gate = HunYuanMoEV1Gate(config, layer_idx=layer_idx) + # self.wg = nn.Linear(config.hidden_size, config.num_experts, bias=False, dtype=torch.float32) + self.experts = nn.ModuleList( + [HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=False) for _ in range(self.num_experts)] + ) + + self.shared_mlp = HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=True) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states_mlp = self.shared_mlp(hidden_states) + router_logits = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + # we cast back to the input dtype + routing_weights = routing_weights.to(hidden_states.dtype) + + final_hidden_states = torch.zeros( + (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated + expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) + + # Loop over all available experts in the model and perform the computation on each expert + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + for expert_idx in expert_hit: + expert_layer = self.experts[expert_idx] + idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0)) + + # Index the correct hidden states and compute the expert hidden state for + # the current expert. We need to make sure to multiply the output hidden + # states by `routing_weights` on the corresponding tokens (top-1 and top-2) + current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] + + # However `index_add_` only support torch tensors for indexing so we'll use + # the `top_x` tensor here. + final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype)) + final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim) + return final_hidden_states + hidden_states_mlp + + +class HunYuanMoEV1DecoderLayer(LlamaDecoderLayer): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: int): + super().__init__(config, layer_idx) + self.hidden_size = config.hidden_size + self.self_attn = HunYuanMoEV1Attention(config=config, layer_idx=layer_idx) + self.mlp = HunYuanMoEV1Moe(config, layer_idx=layer_idx) + self.input_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.layer_idx = layer_idx + + +class HunYuanMoEV1PreTrainedModel(LlamaPreTrainedModel): + _can_compile_fullgraph = False + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +class HunYuanMoEV1RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: HunYuanMoEV1Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + if self.rope_type == "dynamic" and config.rope_scaling["alpha"]: + # DynamicNTKAlphaRotary + self.dim = config.head_dim + base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.attention_scaling = 1.0 + else: + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class HunYuanMoEV1Model(LlamaModel): + pass + + +class HunYuanMoEV1ForCausalLM(LlamaForCausalLM): + pass + + +class HunYuanMoEV1ForSequenceClassification(LlamaForSequenceClassification): + pass + + +__all__ = [ + "HunYuanMoEV1ForCausalLM", + "HunYuanMoEV1Model", + "HunYuanMoEV1PreTrainedModel", + "HunYuanMoEV1ForSequenceClassification", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..143013a133fc20073555e166a94228e2241f25a1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/configuration_idefics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/configuration_idefics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..981c0618c278bd8d1d23f62988d920ca507b7c94 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/configuration_idefics.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/image_processing_idefics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/image_processing_idefics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..323499ede03b53efbd99cd3baf5d3af24dc89c70 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/image_processing_idefics.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/modeling_idefics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/modeling_idefics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aad95d179b851666680cec2dd310b8b6e53a005a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/modeling_idefics.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/modeling_tf_idefics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/modeling_tf_idefics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb2c0ea9b0ee43736ca176f79bec3c6ead26a3ff Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/modeling_tf_idefics.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/perceiver.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/perceiver.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb42eb4ec0f44b66bdb3ae7dfe50b83ce815ee6a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/perceiver.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/perceiver_tf.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/perceiver_tf.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4c72bdff14fe118db3819d187d50318b6fde18c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/perceiver_tf.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/processing_idefics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/processing_idefics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54660b51492032e42f517fbcfb8af655a089547b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/processing_idefics.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/vision.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/vision.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e67c74777e4f05b50ff3a3cdb473c4f193f702c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/vision.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/vision_tf.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/vision_tf.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9aa9c84903da03b786a12576b074d3490d222fd6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/vision_tf.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..098ffb6296f547e6dd9f1f990d21e28bc5cb0f7b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_imagegpt import * + from .feature_extraction_imagegpt import * + from .image_processing_imagegpt import * + from .image_processing_imagegpt_fast import * + from .modeling_imagegpt import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/configuration_imagegpt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/configuration_imagegpt.py new file mode 100644 index 0000000000000000000000000000000000000000..8cfa8d5e47826e4323f74b33430fa872005c88ea --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/configuration_imagegpt.py @@ -0,0 +1,200 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""OpenAI ImageGPT configuration""" + +from collections import OrderedDict +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Optional + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +if TYPE_CHECKING: + from ... import FeatureExtractionMixin, TensorType + +logger = logging.get_logger(__name__) + + +class ImageGPTConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`ImageGPTModel`] or a [`TFImageGPTModel`]. It is + used to instantiate a GPT-2 model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the ImageGPT + [openai/imagegpt-small](https://huggingface.co/openai/imagegpt-small) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 512): + Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ImageGPTModel`] or [`TFImageGPTModel`]. + n_positions (`int`, *optional*, defaults to 32*32): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_embd (`int`, *optional*, defaults to 512): + Dimensionality of the embeddings and hidden states. + n_layer (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + n_inner (`int`, *optional*, defaults to None): + Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd + activation_function (`str`, *optional*, defaults to `"quick_gelu"`): + Activation function (can be one of the activation functions defined in src/transformers/activations.py). + Defaults to "quick_gelu". + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`int`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + scale_attn_weights (`bool`, *optional*, defaults to `True`): + Scale attention weights by dividing by sqrt(hidden_size).. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): + Whether to additionally scale attention weights by `1 / layer_idx + 1`. + reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): + Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention + dot-product/softmax to float() when training with mixed precision. + + Example: + + ```python + >>> from transformers import ImageGPTConfig, ImageGPTModel + + >>> # Initializing a ImageGPT configuration + >>> configuration = ImageGPTConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = ImageGPTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "imagegpt" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "n_embd", + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=512 + 1, # add one for start of sentence (sos) token + n_positions=32 * 32, + n_embd=512, + n_layer=24, + n_head=8, + n_inner=None, + activation_function="quick_gelu", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + scale_attn_weights=True, + use_cache=True, + tie_word_embeddings=False, + scale_attn_by_inverse_layer_idx=False, + reorder_and_upcast_attn=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx + self.reorder_and_upcast_attn = reorder_and_upcast_attn + self.tie_word_embeddings = tie_word_embeddings + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +class ImageGPTOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ] + ) + + def generate_dummy_inputs( + self, + preprocessor: "FeatureExtractionMixin", + batch_size: int = 1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional["TensorType"] = None, + num_channels: int = 3, + image_width: int = 32, + image_height: int = 32, + ) -> Mapping[str, Any]: + """ + Generate inputs to provide to the ONNX exporter for the specific framework + + Args: + preprocessor ([`PreTrainedTokenizerBase`] or [`FeatureExtractionMixin`]): + The preprocessor associated with this model configuration. + batch_size (`int`, *optional*, defaults to -1): + The batch size to export the model for (-1 means dynamic axis). + num_choices (`int`, *optional*, defaults to -1): + The number of candidate answers provided for multiple choice task (-1 means dynamic axis). + seq_length (`int`, *optional*, defaults to -1): + The sequence length to export the model for (-1 means dynamic axis). + is_pair (`bool`, *optional*, defaults to `False`): + Indicate if the input is a pair (sentence 1, sentence 2) + framework (`TensorType`, *optional*, defaults to `None`): + The framework (PyTorch or TensorFlow) that the tokenizer will generate tensors for. + num_channels (`int`, *optional*, defaults to 3): + The number of channels of the generated images. + image_width (`int`, *optional*, defaults to 40): + The width of the generated images. + image_height (`int`, *optional*, defaults to 40): + The height of the generated images. + + Returns: + Mapping[str, Tensor] holding the kwargs to provide to the model's forward function + """ + + input_image = self._generate_dummy_images(batch_size, num_channels, image_height, image_width) + inputs = dict(preprocessor(images=input_image, return_tensors=framework)) + + return inputs + + +__all__ = ["ImageGPTConfig", "ImageGPTOnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/feature_extraction_imagegpt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/feature_extraction_imagegpt.py new file mode 100644 index 0000000000000000000000000000000000000000..46787f139f10a0c339e4cea9524d34c00a03ceb6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/feature_extraction_imagegpt.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for ImageGPT.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_imagegpt import ImageGPTImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class ImageGPTFeatureExtractor(ImageGPTImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class ImageGPTFeatureExtractor is deprecated and will be removed in version 5 of Transformers." + " Please use ImageGPTImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["ImageGPTFeatureExtractor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt.py new file mode 100644 index 0000000000000000000000000000000000000000..aa2114509f7026d9f266be952dc220f2de1b632d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt.py @@ -0,0 +1,314 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for ImageGPT.""" + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import rescale, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging +from ...utils.import_utils import requires + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +def squared_euclidean_distance(a, b): + b = b.T + a2 = np.sum(np.square(a), axis=1) + b2 = np.sum(np.square(b), axis=0) + ab = np.matmul(a, b) + d = a2[:, None] - 2 * ab + b2[None, :] + return d + + +def color_quantize(x, clusters): + x = x.reshape(-1, 3) + d = squared_euclidean_distance(x, clusters) + return np.argmin(d, axis=1) + + +@requires(backends=("vision",)) +class ImageGPTImageProcessor(BaseImageProcessor): + r""" + Constructs a ImageGPT image processor. This image processor can be used to resize images to a smaller resolution + (such as 32x32 or 64x64), normalize them and finally color quantize them to obtain sequences of "pixel values" + (color clusters). + + Args: + clusters (`np.ndarray` or `list[list[int]]`, *optional*): + The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` + in `preprocess`. + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's dimensions to `(size["height"], size["width"])`. Can be overridden by + `do_resize` in `preprocess`. + size (`dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`): + Size of the image after resizing. Can be overridden by `size` in `preprocess`. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image pixel value to between [-1, 1]. Can be overridden by `do_normalize` in + `preprocess`. + do_color_quantize (`bool`, *optional*, defaults to `True`): + Whether to color quantize the image. Can be overridden by `do_color_quantize` in `preprocess`. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + # clusters is a first argument to maintain backwards compatibility with the old ImageGPTImageProcessor + clusters: Optional[Union[list[list[int]], np.ndarray]] = None, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_normalize: bool = True, + do_color_quantize: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 256, "width": 256} + size = get_size_dict(size) + self.clusters = np.array(clusters) if clusters is not None else None + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_normalize = do_normalize + self.do_color_quantize = do_color_quantize + + # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def normalize( + self, + image: np.ndarray, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Normalizes an images' pixel values to between [-1, 1]. + + Args: + image (`np.ndarray`): + Image to normalize. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + image = rescale(image=image, scale=1 / 127.5, data_format=data_format, input_data_format=input_data_format) + image = image - 1 + return image + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_normalize: Optional[bool] = None, + do_color_quantize: Optional[bool] = None, + clusters: Optional[Union[list[list[int]], np.ndarray]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_normalize=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only + has an effect if `do_resize` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image + do_color_quantize (`bool`, *optional*, defaults to `self.do_color_quantize`): + Whether to color quantize the image. + clusters (`np.ndarray` or `list[list[int]]`, *optional*, defaults to `self.clusters`): + Clusters used to quantize the image of shape `(n_clusters, 3)`. Only has an effect if + `do_color_quantize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + Only has an effect if `do_color_quantize` is set to `False`. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size) + resample = resample if resample is not None else self.resample + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize + clusters = clusters if clusters is not None else self.clusters + clusters = np.array(clusters) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + # Here, normalize() is using a constant factor to divide pixel values. + # hence, the method does not need image_mean and image_std. + validate_preprocess_arguments( + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_color_quantize and clusters is None: + raise ValueError("Clusters must be specified if do_color_quantize is True.") + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_normalize and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If you wish to do this, " + "make sure to set `do_normalize` to `False` and that pixel values are between [-1, 1].", + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [self.normalize(image=image, input_data_format=input_data_format) for image in images] + + if do_color_quantize: + images = [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images] + # color quantize from (batch_size, height, width, 3) to (batch_size, height, width) + images = np.array(images) + images = color_quantize(images, clusters).reshape(images.shape[:-1]) + + # flatten to (batch_size, height*width) + batch_size = images.shape[0] + images = images.reshape(batch_size, -1) + + # We need to convert back to a list of images to keep consistent behaviour across processors. + images = list(images) + data = {"input_ids": images} + else: + images = [to_channel_dimension_format(image, data_format, input_data_format) for image in images] + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def to_dict(self): + output = super().to_dict() + # Ensure clusters are JSON/equality friendly + if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): + output["clusters"] = output["clusters"].tolist() + # Need to set missing keys from slow processor to match the expected behavior in save/load tests compared to fast processor + missing_keys = ["image_mean", "image_std", "rescale_factor", "do_rescale"] + for key in missing_keys: + if key in output: + output[key] = None + + return output + + +__all__ = ["ImageGPTImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..7a6bcc53ae1acaefa1f4a923f7d5b422c7a62d8e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -0,0 +1,198 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for ImageGPT.""" + +from typing import Optional, Union + +import numpy as np +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import BatchFeature +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, +) +from ...image_transforms import group_images_by_shape, reorder_images +from ...image_utils import PILImageResampling +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + auto_docstring, +) + + +def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """ + Compute squared Euclidean distances between all pixels and clusters. + + Args: + a: (N, 3) tensor of pixel RGB values + b: (M, 3) tensor of cluster RGB values + + Returns: + (N, M) tensor of squared distances + """ + b = b.t() # (3, M) + a2 = torch.sum(a**2, dim=1) # (N,) + b2 = torch.sum(b**2, dim=0) # (M,) + ab = torch.matmul(a, b) # (N, M) + d = a2[:, None] - 2 * ab + b2[None, :] # Squared Euclidean Distance: a^2 - 2ab + b^2 + return d # (N, M) tensor of squared distances + + +def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tensor: + """ + Assign each pixel to its nearest color cluster. + + Args: + x: (H*W, 3) tensor of flattened pixel RGB values + clusters: (n_clusters, 3) tensor of cluster RGB values + + Returns: + (H*W,) tensor of cluster indices + """ + d = squared_euclidean_distance_torch(x, clusters) + return torch.argmin(d, dim=1) + + +class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): + The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` + in `preprocess`. + do_color_quantize (`bool`, *optional*, defaults to `True`): + Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices. + When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling. + """ + + clusters: Optional[Union[np.ndarray, list[list[int]], torch.Tensor]] + do_color_quantize: Optional[bool] + + +@auto_docstring +class ImageGPTImageProcessorFast(BaseImageProcessorFast): + model_input_names = ["input_ids"] + resample = PILImageResampling.BILINEAR + do_color_quantize = True + clusters = None + image_mean = [0.5, 0.5, 0.5] + image_std = [0.5, 0.5, 0.5] + do_rescale = True + do_normalize = True + valid_kwargs = ImageGPTFastImageProcessorKwargs + + def __init__( + self, + clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, # keep as arg for backwards compatibility + **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], + ): + r""" + clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): + The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` + in `preprocess`. + """ + clusters = torch.as_tensor(clusters, dtype=torch.float32) if clusters is not None else None + super().__init__(clusters=clusters, **kwargs) + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: dict[str, int], + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: dict[str, int], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + do_color_quantize: Optional[bool] = None, + clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, + disable_grouping: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ): + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + pixel_values = reorder_images(processed_images_grouped, grouped_images_index) + + # If color quantization is requested, perform it; otherwise return pixel values + if do_color_quantize: + # Prepare clusters + if clusters is None: + raise ValueError("Clusters must be provided for color quantization.") + # Convert to torch tensor if needed (clusters might be passed as list/numpy) + clusters_torch = ( + torch.as_tensor(clusters, dtype=torch.float32) if not isinstance(clusters, torch.Tensor) else clusters + ).to(pixel_values[0].device, dtype=pixel_values[0].dtype) + + # Group images by shape for batch processing + # We need to check if the pixel values are a tensor or a list of tensors + grouped_images, grouped_images_index = group_images_by_shape( + pixel_values, disable_grouping=disable_grouping + ) + # Process each group + input_ids_grouped = {} + + for shape, stacked_images in grouped_images.items(): + input_ids = color_quantize_torch( + stacked_images.permute(0, 2, 3, 1).reshape(-1, 3), clusters_torch + ) # (B*H*W, C) + input_ids_grouped[shape] = input_ids.reshape(stacked_images.shape[0], -1).reshape( + stacked_images.shape[0], -1 + ) # (B, H, W) + + input_ids = reorder_images(input_ids_grouped, grouped_images_index) + + return BatchFeature( + data={"input_ids": torch.stack(input_ids, dim=0) if return_tensors else input_ids}, + tensor_type=return_tensors, + ) + + pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values + return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) + + def to_dict(self): + # Convert torch tensors to lists for JSON serialization + output = super().to_dict() + if output.get("clusters") is not None and isinstance(output["clusters"], torch.Tensor): + output["clusters"] = output["clusters"].tolist() + + return output + + +__all__ = ["ImageGPTImageProcessorFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/modeling_imagegpt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/modeling_imagegpt.py new file mode 100644 index 0000000000000000000000000000000000000000..a962141e447946633a2598b3c0bfe43b46f10e6d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/modeling_imagegpt.py @@ -0,0 +1,1024 @@ +# coding=utf-8 +# Copyright 2021 The OpenAI Team Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch OpenAI ImageGPT model.""" + +import math +import os +from typing import Any, Optional, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + SequenceClassifierOutputWithPast, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer +from ...utils import ( + auto_docstring, + logging, + torch_float, +) +from .configuration_imagegpt import ImageGPTConfig + + +logger = logging.get_logger(__name__) + + +def load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path): + """ + Load tf checkpoints in a pytorch model + """ + try: + import re + + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(imagegpt_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array.squeeze()) + + for name, array in zip(names, arrays): + name = name[6:] # skip "model/" + name = name.split("/") + + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ) or name[-1] in ["_step"]: + logger.info("Skipping {}".format("/".join(name))) + continue + + pointer = model + if name[-1] not in ["wtet"]: + pointer = getattr(pointer, "transformer") + + for m_name in name: + if re.fullmatch(r"[A-Za-z]+\d+", m_name): + scope_names = re.split(r"(\d+)", m_name) + else: + scope_names = [m_name] + + if scope_names[0] == "w" or scope_names[0] == "g": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "b": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "wpe" or scope_names[0] == "wte": + pointer = getattr(pointer, scope_names[0]) + pointer = getattr(pointer, "weight") + elif scope_names[0] in ["q_proj", "k_proj", "v_proj"]: + pointer = getattr(pointer, "c_attn") + pointer = getattr(pointer, "weight") + elif len(name) == 3 and name[1] == "attn" and scope_names[0] == "c_proj": + pointer = getattr(pointer, scope_names[0]) + pointer = getattr(pointer, "weight") + elif scope_names[0] == "wtet": + pointer = getattr(pointer, "lm_head") + pointer = getattr(pointer, "weight") + elif scope_names[0] == "sos": + pointer = getattr(pointer, "wte") + pointer = getattr(pointer, "weight") + else: + pointer = getattr(pointer, scope_names[0]) + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + + if len(name) > 1 and name[1] == "attn" or name[-1] == "wtet" or name[-1] == "sos" or name[-1] == "wte": + pass # array is used to initialize only part of the pointer so sizes won't match + else: + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + + logger.info(f"Initialize PyTorch weight {name}") + + if name[-1] == "q_proj": + pointer.data[:, : config.n_embd] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T + elif name[-1] == "k_proj": + pointer.data[:, config.n_embd : 2 * config.n_embd] = torch.from_numpy( + array.reshape(config.n_embd, config.n_embd) + ).T + elif name[-1] == "v_proj": + pointer.data[:, 2 * config.n_embd :] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T + elif len(name) == 3 and name[1] == "attn" and name[2] == "c_proj": + pointer.data = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)) + elif name[-1] == "wtet": + pointer.data = torch.from_numpy(array) + elif name[-1] == "wte": + pointer.data[: config.vocab_size - 1, :] = torch.from_numpy(array) + elif name[-1] == "sos": + pointer.data[-1] = torch.from_numpy(array) + else: + pointer.data = torch.from_numpy(array) + + return model + + +class ImageGPTLayerNorm(nn.Module): + def __init__(self, hidden_size: tuple[int], eps: float = 1e-5): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.Tensor(hidden_size)) + + def forward(self, tensor: torch.Tensor) -> torch.Tensor: + # input is not mean centered + tensor = tensor / torch.sqrt(torch.mean(torch.square(tensor), axis=-1, keepdim=True) + self.eps) + tensor = tensor * self.weight + return tensor + + +class ImageGPTAttention(nn.Module): + def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx: Optional[int] = None): + super().__init__() + + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( + 1, 1, max_positions, max_positions + ), + persistent=False, + ) + self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) + + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.split_size = self.embed_dim + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + + self.scale_attn_weights = config.scale_attn_weights + self.is_cross_attention = is_cross_attention + + # Layer-wise attention scaling, reordering, and upcasting + self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx + self.layer_idx = layer_idx + self.reorder_and_upcast_attn = config.reorder_and_upcast_attn + + if self.is_cross_attention: + self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) + self.q_attn = Conv1D(self.embed_dim, self.embed_dim) + else: + self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + self.c_proj = Conv1D(self.embed_dim, self.embed_dim) + + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) + + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + + # Update hyper params + self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads)) + self.num_heads = self.num_heads - len(heads) + self.pruned_heads = self.pruned_heads.union(heads) + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + + if self.scale_attn_weights: + attn_weights = attn_weights / torch_float(value.size(-1) ** 0.5) + + # Layer-wise attention scaling + if self.scale_attn_by_inverse_layer_idx: + attn_weights = attn_weights / float(self.layer_idx + 1) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.Softmax(dim=-1)(attn_weights) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): + # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) + bsz, num_heads, q_seq_len, dk = query.size() + _, _, k_seq_len, _ = key.size() + + # Preallocate attn_weights for `baddbmm` + attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) + + # Compute Scale Factor + scale_factor = 1.0 + if self.scale_attn_weights: + scale_factor /= float(value.size(-1)) ** 0.5 + + if self.scale_attn_by_inverse_layer_idx: + scale_factor /= float(self.layer_idx + 1) + + # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk)) + with torch.autocast(query.device.type, enabled=False): + q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) + attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) + attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.Softmax(dim=-1)(attn_weights) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise + if attn_weights.dtype != torch.float32: + raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def _split_heads(self, tensor, num_heads, attn_head_size): + """ + Splits hidden_size dim into attn_head_size and num_heads + """ + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + tensor = tensor.view(*new_shape) + return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + + def _merge_heads(self, tensor, num_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden_size + """ + tensor = tensor.permute(0, 2, 1, 3).contiguous() + new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) + return tensor.view(new_shape) + + def forward( + self, + hidden_states: torch.Tensor, + layer_past: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple: + is_cross_attention = encoder_hidden_states is not None + bsz, seq_len, _ = hidden_states.shape + + if layer_past is not None: + if isinstance(layer_past, EncoderDecoderCache): + is_updated = layer_past.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = layer_past.cross_attention_cache + else: + curr_past_key_value = layer_past.self_attention_cache + else: + curr_past_key_value = layer_past + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention: + if not hasattr(self, "q_attn"): + raise ValueError( + "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`." + ) + + if layer_past is not None and is_updated: + # reuse k,v, cross_attentions, and compute only q + query = self.q_attn(hidden_states) + key = curr_past_key_value.layers[self.layer_idx].keys + value = curr_past_key_value.layers[self.layer_idx].values + else: + query = self.q_attn(hidden_states) + key, value = self.c_attn(current_states).split(self.split_size, dim=2) + key = key.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + value = value.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + else: + query, key, value = self.c_attn(current_states).split(self.split_size, dim=2) + key = key.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + value = value.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + + if layer_past is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key, value = curr_past_key_value.update(key, value, self.layer_idx, {"cache_position": cache_position}) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + layer_past.is_updated[self.layer_idx] = True + + query = query.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + + if self.reorder_and_upcast_attn: + attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) + else: + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + return attn_output, attn_weights + + +class ImageGPTMLP(nn.Module): + def __init__(self, intermediate_size, config): + super().__init__() + embed_dim = config.hidden_size + self.c_fc = Conv1D(intermediate_size, embed_dim) + self.c_proj = Conv1D(embed_dim, intermediate_size) + self.act = ACT2FN[config.activation_function] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class ImageGPTBlock(GradientCheckpointingLayer): + def __init__(self, config, layer_idx=None): + super().__init__() + hidden_size = config.hidden_size + inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size + + self.ln_1 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.attn = ImageGPTAttention(config, layer_idx=layer_idx) + self.ln_2 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + if config.add_cross_attention: + self.crossattention = ImageGPTAttention(config, is_cross_attention=True, layer_idx=layer_idx) + self.ln_cross_attn = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + self.mlp = ImageGPTMLP(inner_dim, config) + + def forward( + self, + hidden_states: torch.Tensor, + layer_past: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_outputs = self.attn( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attn_output = attn_outputs[0] + outputs = attn_outputs[1:] + # residual connection + hidden_states = attn_output + residual + + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + residual = hidden_states + hidden_states = self.ln_cross_attn(hidden_states) + cross_attn_outputs = self.crossattention( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attn_output = cross_attn_outputs[0] + # residual connection + hidden_states = residual + attn_output + outputs = outputs + cross_attn_outputs[1:] # add cross attentions if we output attention weights + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + feed_forward_hidden_states + + return (hidden_states,) + outputs + + +@auto_docstring +class ImageGPTPreTrainedModel(PreTrainedModel): + config: ImageGPTConfig + load_tf_weights = load_tf_weights_in_imagegpt + base_model_prefix = "transformer" + main_input_name = "input_ids" + supports_gradient_checkpointing = True + _no_split_modules = ["ImageGPTBlock"] + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, ImageGPTLayerNorm): + module.weight.data.fill_(1.0) + + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if "c_proj" in name and "weight" in name: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))) + + +@auto_docstring +class ImageGPTModel(ImageGPTPreTrainedModel): + def __init__(self, config: ImageGPTConfig): + super().__init__(config) + + self.embed_dim = config.hidden_size + + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) + + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([ImageGPTBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)]) + self.ln_f = ImageGPTLayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs: Any, + ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details. + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, ImageGPTModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small") + >>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache and past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + if use_cache and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + past_length = past_key_values.get_seq_length() if past_key_values is not None else past_key_values + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + + if position_ids is None: + position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0) + + # ImageGPTAttention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.add_cross_attention and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device) + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + output_shape = input_shape + (hidden_states.size(-1),) + + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_hidden_states = () if output_hidden_states else None + for i, block in enumerate(self.h): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if isinstance(head_mask, torch.Tensor): + head_mask = head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block( + hidden_states, + past_key_values, + attention_mask, + head_mask[i], + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (outputs[2],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.ln_f(hidden_states) + hidden_states = hidden_states.view(*output_shape) + + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +@auto_docstring( + custom_intro=""" + The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """ +) +class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: ImageGPTConfig): + super().__init__(config) + self.transformer = ImageGPTModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size - 1, bias=False) + + # Model parallel + self.model_parallel = False + self.device_map = None + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs: Any, + ) -> Union[tuple, CausalLMOutputWithCrossAttentions]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details. + labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling + >>> import torch + >>> import matplotlib.pyplot as plt + >>> import numpy as np + + >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small") + >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small") + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + >>> model.to(device) # doctest: +IGNORE_RESULT + + >>> # unconditional generation of 8 images + >>> batch_size = 4 + >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1) # initialize with SOS token + >>> context = context.to(device) + >>> output = model.generate( + ... input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40 + ... ) + + >>> clusters = image_processor.clusters + >>> height = image_processor.size["height"] + >>> width = image_processor.size["width"] + + >>> samples = output[:, 1:].detach().cpu().numpy() + >>> samples_img = [ + ... np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples + ... ] # convert color cluster tokens back to pixels + >>> f, axes = plt.subplots(1, batch_size, dpi=300) + + >>> for img, ax in zip(samples_img, axes): # doctest: +IGNORE_RESULT + ... ax.axis("off") + ... ax.imshow(img) + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) + + +@auto_docstring( + custom_intro=""" + The ImageGPT Model transformer with an image classification head on top (linear layer). + [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification. + """ +) +class ImageGPTForImageClassification(ImageGPTPreTrainedModel): + def __init__(self, config: ImageGPTConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = ImageGPTModel(config) + self.score = nn.Linear(config.n_embd, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs: Any, + ) -> Union[tuple, SequenceClassifierOutputWithPast]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, ImageGPTForImageClassification + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small") + >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> logits = outputs.logits + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + # average-pool the hidden states along the sequence dimension + pooled_hidden_states = hidden_states.mean(dim=1) + # project from (batch_size, hidden_size) to (batch_size, num_labels) + logits = self.score(pooled_hidden_states) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = [ + "ImageGPTForCausalImageModeling", + "ImageGPTForImageClassification", + "ImageGPTModel", + "ImageGPTPreTrainedModel", + "load_tf_weights_in_imagegpt", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9564a1ea9ec92004d96ff6d980ab0ef4386b1d33 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/configuration_informer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/configuration_informer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..448307e6dc8d00f1f1c48f624bbcdc7bd802d563 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/configuration_informer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/modular_informer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/modular_informer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c13bb544a33bf33c9091b6f06115dd096f279db Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/modular_informer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..274fad9c1fa8509362f70ede73ba0dc6526805cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/configuration_instructblipvideo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/configuration_instructblipvideo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1760c53ec46e5f27f6b46c593873eeae56f68da7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/configuration_instructblipvideo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/image_processing_instructblipvideo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/image_processing_instructblipvideo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7134e9ca6e050e71448230857587d75b7d2c2a29 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/image_processing_instructblipvideo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/modeling_instructblipvideo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/modeling_instructblipvideo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd1f85b47bad95a2adc0098a1fbdf59b3c550a75 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/modeling_instructblipvideo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/modular_instructblipvideo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/modular_instructblipvideo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d63e1eb666d33cafa5098c62a0bde4d127183083 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/modular_instructblipvideo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/video_processing_instructblipvideo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/video_processing_instructblipvideo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19205bc39e25210cff274811b5ca1694c2943e17 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/video_processing_instructblipvideo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ea61afd214787c6c955e44bb43c4c5865be0cca Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/configuration_kyutai_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/configuration_kyutai_speech_to_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..184b7900cc80b34a312be42477729aa679074238 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/configuration_kyutai_speech_to_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/feature_extraction_kyutai_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/feature_extraction_kyutai_speech_to_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3396e2a546e8ab358f521ba74af9016d23bb2165 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/feature_extraction_kyutai_speech_to_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modeling_kyutai_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modeling_kyutai_speech_to_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ea4da3ecc078eeac929c3b68bbdb03d251637aa Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modeling_kyutai_speech_to_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modular_kyutai_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modular_kyutai_speech_to_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3745bb7c8577d6c508b66bf4f09af4f46c84c763 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modular_kyutai_speech_to_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/processing_kyutai_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/processing_kyutai_speech_to_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd4cd5987c496f6fea693f308fdd5ede8996a7d8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/processing_kyutai_speech_to_text.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/configuration_layoutlmv3.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/configuration_layoutlmv3.py new file mode 100644 index 0000000000000000000000000000000000000000..c845bb43b346a2d00a93193125f8b3d6d72c1ebc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/configuration_layoutlmv3.py @@ -0,0 +1,294 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""LayoutLMv3 model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Optional + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...onnx.utils import compute_effective_axis_dimension +from ...utils import logging + + +if TYPE_CHECKING: + from ...processing_utils import ProcessorMixin + from ...utils import TensorType + + +logger = logging.get_logger(__name__) + + +class LayoutLMv3Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LayoutLMv3Model`]. It is used to instantiate an + LayoutLMv3 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the LayoutLMv3 + [microsoft/layoutlmv3-base](https://huggingface.co/microsoft/layoutlmv3-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the LayoutLMv3 model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`LayoutLMv3Model`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`LayoutLMv3Model`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + max_2d_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum value that the 2D position embedding might ever be used with. Typically set this to something + large just in case (e.g., 1024). + coordinate_size (`int`, *optional*, defaults to `128`): + Dimension of the coordinate embeddings. + shape_size (`int`, *optional*, defaults to `128`): + Dimension of the width and height embeddings. + has_relative_attention_bias (`bool`, *optional*, defaults to `True`): + Whether or not to use a relative attention bias in the self-attention mechanism. + rel_pos_bins (`int`, *optional*, defaults to 32): + The number of relative position bins to be used in the self-attention mechanism. + max_rel_pos (`int`, *optional*, defaults to 128): + The maximum number of relative positions to be used in the self-attention mechanism. + max_rel_2d_pos (`int`, *optional*, defaults to 256): + The maximum number of relative 2D positions in the self-attention mechanism. + rel_2d_pos_bins (`int`, *optional*, defaults to 64): + The number of 2D relative position bins in the self-attention mechanism. + has_spatial_attention_bias (`bool`, *optional*, defaults to `True`): + Whether or not to use a spatial attention bias in the self-attention mechanism. + visual_embed (`bool`, *optional*, defaults to `True`): + Whether or not to add patch embeddings. + input_size (`int`, *optional*, defaults to `224`): + The size (resolution) of the images. + num_channels (`int`, *optional*, defaults to `3`): + The number of channels of the images. + patch_size (`int`, *optional*, defaults to `16`) + The size (resolution) of the patches. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Example: + + ```python + >>> from transformers import LayoutLMv3Config, LayoutLMv3Model + + >>> # Initializing a LayoutLMv3 microsoft/layoutlmv3-base style configuration + >>> configuration = LayoutLMv3Config() + + >>> # Initializing a model (with random weights) from the microsoft/layoutlmv3-base style configuration + >>> model = LayoutLMv3Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "layoutlmv3" + + def __init__( + self, + vocab_size=50265, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-5, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + max_2d_position_embeddings=1024, + coordinate_size=128, + shape_size=128, + has_relative_attention_bias=True, + rel_pos_bins=32, + max_rel_pos=128, + rel_2d_pos_bins=64, + max_rel_2d_pos=256, + has_spatial_attention_bias=True, + text_embed=True, + visual_embed=True, + input_size=224, + num_channels=3, + patch_size=16, + classifier_dropout=None, + **kwargs, + ): + super().__init__( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + hidden_dropout_prob=hidden_dropout_prob, + attention_probs_dropout_prob=attention_probs_dropout_prob, + max_position_embeddings=max_position_embeddings, + type_vocab_size=type_vocab_size, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + self.max_2d_position_embeddings = max_2d_position_embeddings + self.coordinate_size = coordinate_size + self.shape_size = shape_size + self.has_relative_attention_bias = has_relative_attention_bias + self.rel_pos_bins = rel_pos_bins + self.max_rel_pos = max_rel_pos + self.has_spatial_attention_bias = has_spatial_attention_bias + self.rel_2d_pos_bins = rel_2d_pos_bins + self.max_rel_2d_pos = max_rel_2d_pos + self.text_embed = text_embed + self.visual_embed = visual_embed + self.input_size = input_size + self.num_channels = num_channels + self.patch_size = patch_size + self.classifier_dropout = classifier_dropout + + +class LayoutLMv3OnnxConfig(OnnxConfig): + torch_onnx_minimum_version = version.parse("1.12") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + # The order of inputs is different for question answering and sequence classification + if self.task in ["question-answering", "sequence-classification"]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ("bbox", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ] + ) + else: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("bbox", {0: "batch", 1: "sequence"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch", 1: "num_channels"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-5 + + @property + def default_onnx_opset(self) -> int: + return 12 + + def generate_dummy_inputs( + self, + processor: "ProcessorMixin", + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional["TensorType"] = None, + num_channels: int = 3, + image_width: int = 40, + image_height: int = 40, + ) -> Mapping[str, Any]: + """ + Generate inputs to provide to the ONNX exporter for the specific framework + + Args: + processor ([`ProcessorMixin`]): + The processor associated with this model configuration. + batch_size (`int`, *optional*, defaults to -1): + The batch size to export the model for (-1 means dynamic axis). + seq_length (`int`, *optional*, defaults to -1): + The sequence length to export the model for (-1 means dynamic axis). + is_pair (`bool`, *optional*, defaults to `False`): + Indicate if the input is a pair (sentence 1, sentence 2). + framework (`TensorType`, *optional*, defaults to `None`): + The framework (PyTorch or TensorFlow) that the processor will generate tensors for. + num_channels (`int`, *optional*, defaults to 3): + The number of channels of the generated images. + image_width (`int`, *optional*, defaults to 40): + The width of the generated images. + image_height (`int`, *optional*, defaults to 40): + The height of the generated images. + + Returns: + Mapping[str, Any]: holding the kwargs to provide to the model's forward function + """ + + # A dummy image is used so OCR should not be applied + setattr(processor.image_processor, "apply_ocr", False) + + # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX + batch_size = compute_effective_axis_dimension( + batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0 + ) + # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX + token_to_add = processor.tokenizer.num_special_tokens_to_add(is_pair) + seq_length = compute_effective_axis_dimension( + seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add + ) + # Generate dummy inputs according to compute batch and sequence + dummy_text = [[" ".join([processor.tokenizer.unk_token]) * seq_length]] * batch_size + + # Generate dummy bounding boxes + dummy_bboxes = [[[48, 84, 73, 128]]] * batch_size + + # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX + # batch_size = compute_effective_axis_dimension(batch_size, fixed_dimension=OnnxConfig.default_fixed_batch) + dummy_image = self._generate_dummy_images(batch_size, num_channels, image_height, image_width) + + inputs = dict( + processor( + dummy_image, + text=dummy_text, + boxes=dummy_bboxes, + return_tensors=framework, + ) + ) + + return inputs + + +__all__ = ["LayoutLMv3Config", "LayoutLMv3OnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/image_processing_layoutlmv3.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/image_processing_layoutlmv3.py new file mode 100644 index 0000000000000000000000000000000000000000..8189abf673119cccbf87387f65e15f8dd43bde80 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/image_processing_layoutlmv3.py @@ -0,0 +1,380 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for LayoutLMv3.""" + +from collections.abc import Iterable +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import resize, to_channel_dimension_format, to_pil_image +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import ( + TensorType, + filter_out_non_signature_kwargs, + is_pytesseract_available, + is_vision_available, + logging, + requires_backends, +) +from ...utils.import_utils import requires + + +if is_vision_available(): + import PIL + +# soft dependency +if is_pytesseract_available(): + import pytesseract + +logger = logging.get_logger(__name__) + + +def normalize_box(box, width, height): + return [ + int(1000 * (box[0] / width)), + int(1000 * (box[1] / height)), + int(1000 * (box[2] / width)), + int(1000 * (box[3] / height)), + ] + + +def apply_tesseract( + image: np.ndarray, + lang: Optional[str], + tesseract_config: Optional[str], + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes.""" + + # apply OCR + pil_image = to_pil_image(image, input_data_format=input_data_format) + image_width, image_height = pil_image.size + data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config) + words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"] + + # filter empty words and corresponding coordinates + irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()] + words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices] + left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices] + top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices] + width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices] + height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices] + + # turn coordinates into (left, top, left+width, top+height) format + actual_boxes = [] + for x, y, w, h in zip(left, top, width, height): + actual_box = [x, y, x + w, y + h] + actual_boxes.append(actual_box) + + # finally, normalize the bounding boxes + normalized_boxes = [] + for box in actual_boxes: + normalized_boxes.append(normalize_box(box, image_width, image_height)) + + assert len(words) == len(normalized_boxes), "Not as many words as there are bounding boxes" + + return words, normalized_boxes + + +@requires(backends=("vision",)) +class LayoutLMv3ImageProcessor(BaseImageProcessor): + r""" + Constructs a LayoutLMv3 image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to `(size["height"], size["width"])`. Can be + overridden by `do_resize` in `preprocess`. + size (`dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`): + Size of the image after resizing. Can be overridden by `size` in `preprocess`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image's pixel values by the specified `rescale_value`. Can be overridden by + `do_rescale` in `preprocess`. + rescale_factor (`float`, *optional*, defaults to 1 / 255): + Value by which the image's pixel values are rescaled. Can be overridden by `rescale_factor` in + `preprocess`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`Iterable[float]` or `float`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`Iterable[float]` or `float`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + apply_ocr (`bool`, *optional*, defaults to `True`): + Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by + the `apply_ocr` parameter in the `preprocess` method. + ocr_lang (`str`, *optional*): + The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is + used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method. + tesseract_config (`str`, *optional*): + Any additional custom configuration flags that are forwarded to the `config` parameter when calling + Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the + `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_value: float = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, Iterable[float]]] = None, + image_std: Optional[Union[float, Iterable[float]]] = None, + apply_ocr: bool = True, + ocr_lang: Optional[str] = None, + tesseract_config: Optional[str] = "", + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_value + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.apply_ocr = apply_ocr + self.ocr_lang = ocr_lang + self.tesseract_config = tesseract_config + + # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample=None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, Iterable[float]]] = None, + image_std: Optional[Union[float, Iterable[float]]] = None, + apply_ocr: Optional[bool] = None, + ocr_lang: Optional[str] = None, + tesseract_config: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Desired size of the output image after applying `resize`. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` filters. + Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image pixel values between [0, 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to apply to the image pixel values. Only has an effect if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `Iterable[float]`, *optional*, defaults to `self.image_mean`): + Mean values to be used for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `Iterable[float]`, *optional*, defaults to `self.image_std`): + Standard deviation values to be used for normalization. Only has an effect if `do_normalize` is set to + `True`. + apply_ocr (`bool`, *optional*, defaults to `self.apply_ocr`): + Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. + ocr_lang (`str`, *optional*, defaults to `self.ocr_lang`): + The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is + used. + tesseract_config (`str`, *optional*, defaults to `self.tesseract_config`): + Any additional custom configuration flags that are forwarded to the `config` parameter when calling + Tesseract. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size) + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + apply_ocr = apply_ocr if apply_ocr is not None else self.apply_ocr + ocr_lang = ocr_lang if ocr_lang is not None else self.ocr_lang + tesseract_config = tesseract_config if tesseract_config is not None else self.tesseract_config + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + # Tesseract OCR to get words + normalized bounding boxes + if apply_ocr: + requires_backends(self, "pytesseract") + words_batch = [] + boxes_batch = [] + for image in images: + words, boxes = apply_tesseract(image, ocr_lang, tesseract_config, input_data_format=input_data_format) + words_batch.append(words) + boxes_batch.append(boxes) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + + if apply_ocr: + data["words"] = words_batch + data["boxes"] = boxes_batch + return data + + +__all__ = ["LayoutLMv3ImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/processing_layoutlmv3.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/processing_layoutlmv3.py new file mode 100644 index 0000000000000000000000000000000000000000..1f1b6cead6077b391a993e346b6ab572f73e5380 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/processing_layoutlmv3.py @@ -0,0 +1,184 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for LayoutLMv3. +""" + +import warnings +from typing import Optional, Union + +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType + + +class LayoutLMv3Processor(ProcessorMixin): + r""" + Constructs a LayoutLMv3 processor which combines a LayoutLMv3 image processor and a LayoutLMv3 tokenizer into a + single processor. + + [`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model. + + It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to + get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or + [`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`, + `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned + into token-level `labels` for token classification tasks (such as FUNSD, CORD). + + Args: + image_processor (`LayoutLMv3ImageProcessor`, *optional*): + An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input. + tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`, *optional*): + An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "LayoutLMv3ImageProcessor" + tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast") + + def __init__(self, image_processor=None, tokenizer=None, **kwargs): + feature_extractor = None + if "feature_extractor" in kwargs: + warnings.warn( + "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" + " instead.", + FutureWarning, + ) + feature_extractor = kwargs.pop("feature_extractor") + + image_processor = image_processor if image_processor is not None else feature_extractor + + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None, + boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None, + word_labels: Optional[Union[list[int], list[list[int]]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchEncoding: + """ + This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. In case + [`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and + bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, + together with resized and normalized `pixel_values`. In case [`LayoutLMv3ImageProcessor`] was initialized with + `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along + with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with + resized and normalized `pixel_values`. + + Please refer to the docstring of the above two methods for more information. + """ + # verify input + if self.image_processor.apply_ocr and (boxes is not None): + raise ValueError( + "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True." + ) + + if self.image_processor.apply_ocr and (word_labels is not None): + raise ValueError( + "You cannot provide word labels if you initialized the image processor with apply_ocr set to True." + ) + + # first, apply the image processor + features = self.image_processor(images=images, return_tensors=return_tensors) + + # second, apply the tokenizer + if text is not None and self.image_processor.apply_ocr and text_pair is None: + if isinstance(text, str): + text = [text] # add batch dimension (as the image processor always adds a batch dimension) + text_pair = features["words"] + + encoded_inputs = self.tokenizer( + text=text if text is not None else features["words"], + text_pair=text_pair if text_pair is not None else None, + boxes=boxes if boxes is not None else features["boxes"], + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + + # add pixel values + images = features.pop("pixel_values") + if return_overflowing_tokens is True: + images = self.get_overflowing_images(images, encoded_inputs["overflow_to_sample_mapping"]) + encoded_inputs["pixel_values"] = images + + return encoded_inputs + + def get_overflowing_images(self, images, overflow_to_sample_mapping): + # in case there's an overflow, ensure each `input_ids` sample is mapped to its corresponding image + images_with_overflow = [] + for sample_idx in overflow_to_sample_mapping: + images_with_overflow.append(images[sample_idx]) + + if len(images_with_overflow) != len(overflow_to_sample_mapping): + raise ValueError( + "Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got" + f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}" + ) + + return images_with_overflow + + @property + def model_input_names(self): + return ["input_ids", "bbox", "attention_mask", "pixel_values"] + + @property + def feature_extractor_class(self): + warnings.warn( + "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + FutureWarning, + ) + return self.image_processor_class + + @property + def feature_extractor(self): + warnings.warn( + "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.", + FutureWarning, + ) + return self.image_processor + + +__all__ = ["LayoutLMv3Processor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6f5a662e6afd2c83768e9731d896da83b0b89193 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .processing_layoutxlm import * + from .tokenization_layoutxlm import * + from .tokenization_layoutxlm_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/processing_layoutxlm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/processing_layoutxlm.py new file mode 100644 index 0000000000000000000000000000000000000000..e3ece89f434ba1ab69a1ec2ded35722559c32611 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/processing_layoutxlm.py @@ -0,0 +1,185 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for LayoutXLM. +""" + +import warnings +from typing import Optional, Union + +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType + + +class LayoutXLMProcessor(ProcessorMixin): + r""" + Constructs a LayoutXLM processor which combines a LayoutXLM image processor and a LayoutXLM tokenizer into a single + processor. + + [`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model. + + It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to + get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or + [`LayoutXLMTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`, + `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned + into token-level `labels` for token classification tasks (such as FUNSD, CORD). + + Args: + image_processor (`LayoutLMv2ImageProcessor`, *optional*): + An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input. + tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`, *optional*): + An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "LayoutLMv2ImageProcessor" + tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast") + + def __init__(self, image_processor=None, tokenizer=None, **kwargs): + if "feature_extractor" in kwargs: + warnings.warn( + "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" + " instead.", + FutureWarning, + ) + feature_extractor = kwargs.pop("feature_extractor") + + image_processor = image_processor if image_processor is not None else feature_extractor + + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None, + boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None, + word_labels: Optional[Union[list[int], list[list[int]]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchEncoding: + """ + This method first forwards the `images` argument to [`~LayoutLMv2ImagePrpcessor.__call__`]. In case + [`LayoutLMv2ImagePrpcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and + bounding boxes along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, + together with resized `images`. In case [`LayoutLMv2ImagePrpcessor`] was initialized with `apply_ocr` set to + `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional + arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together with resized `images``. + + Please refer to the docstring of the above two methods for more information. + """ + # verify input + if self.image_processor.apply_ocr and (boxes is not None): + raise ValueError( + "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True." + ) + + if self.image_processor.apply_ocr and (word_labels is not None): + raise ValueError( + "You cannot provide word labels if you initialized the image processor with apply_ocr set to True." + ) + + if return_overflowing_tokens is True and return_offsets_mapping is False: + raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.") + + # first, apply the image processor + features = self.image_processor(images=images, return_tensors=return_tensors) + + # second, apply the tokenizer + if text is not None and self.image_processor.apply_ocr and text_pair is None: + if isinstance(text, str): + text = [text] # add batch dimension (as the image processor always adds a batch dimension) + text_pair = features["words"] + + encoded_inputs = self.tokenizer( + text=text if text is not None else features["words"], + text_pair=text_pair if text_pair is not None else None, + boxes=boxes if boxes is not None else features["boxes"], + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + + # add pixel values + images = features.pop("pixel_values") + if return_overflowing_tokens is True: + images = self.get_overflowing_images(images, encoded_inputs["overflow_to_sample_mapping"]) + encoded_inputs["image"] = images + + return encoded_inputs + + def get_overflowing_images(self, images, overflow_to_sample_mapping): + # in case there's an overflow, ensure each `input_ids` sample is mapped to its corresponding image + images_with_overflow = [] + for sample_idx in overflow_to_sample_mapping: + images_with_overflow.append(images[sample_idx]) + + if len(images_with_overflow) != len(overflow_to_sample_mapping): + raise ValueError( + "Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got" + f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}" + ) + + return images_with_overflow + + @property + def model_input_names(self): + return ["input_ids", "bbox", "attention_mask", "image"] + + @property + def feature_extractor_class(self): + warnings.warn( + "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + FutureWarning, + ) + return self.image_processor_class + + @property + def feature_extractor(self): + warnings.warn( + "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.", + FutureWarning, + ) + return self.image_processor + + +__all__ = ["LayoutXLMProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm.py new file mode 100644 index 0000000000000000000000000000000000000000..3dbe3c21a6365b5a20988626cc57520ea37d5fc3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm.py @@ -0,0 +1,1191 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for LayoutXLM model.""" + +import os +from shutil import copyfile +from typing import Any, Optional, Union + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...tokenization_utils_base import ( + BatchEncoding, + EncodedInput, + PreTokenizedInput, + TextInput, + TextInputPair, + TruncationStrategy, +) +from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging +from ...utils.import_utils import requires +from ..xlm_roberta.tokenization_xlm_roberta import ( + SPIECE_UNDERLINE, + VOCAB_FILES_NAMES, +) + + +logger = logging.get_logger(__name__) + + +LAYOUTXLM_ENCODE_KWARGS_DOCSTRING = r""" + add_special_tokens (`bool`, *optional*, defaults to `True`): + Whether or not to encode the sequences with the special tokens relative to their model. + padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): + Activates and controls truncation. Accepts the following values: + + - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will + truncate token by token, removing a token from the longest sequence in the pair if a pair of + sequences (or a batch of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. + + If left unset or set to `None`, this will use the predefined model maximum length if a maximum length + is required by one of the truncation/padding parameters. If the model has no specific maximum input + length (like XLNet) truncation/padding to a maximum length will be deactivated. + stride (`int`, *optional*, defaults to 0): + If set to a number along with `max_length`, the overflowing tokens returned when + `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence + returned to provide some overlap between truncated and overflowing sequences. The value of this + argument defines the number of overlapping tokens. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. This is especially useful to enable + the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + return_tensors (`str` or [`~file_utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + return_token_type_ids (`bool`, *optional*): + Whether to return token type IDs. If left to the default, will return the token type IDs according to + the specific tokenizer's default, defined by the `return_outputs` attribute. + + [What are token type IDs?](../glossary#token-type-ids) + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the `return_outputs` attribute. + + [What are attention masks?](../glossary#attention-mask) + return_overflowing_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch + of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead + of returning overflowing tokens. + return_special_tokens_mask (`bool`, *optional*, defaults to `False`): + Whether or not to return special tokens mask information. + return_offsets_mapping (`bool`, *optional*, defaults to `False`): + Whether or not to return `(char_start, char_end)` for each token. + + This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using + Python's tokenizer, this method will raise `NotImplementedError`. + return_length (`bool`, *optional*, defaults to `False`): + Whether or not to return the lengths of the encoded inputs. + verbose (`bool`, *optional*, defaults to `True`): + Whether or not to print more information and warnings. + **kwargs: passed to the `self.tokenize()` method + + Return: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + + [What are input IDs?](../glossary#input-ids) + + - **bbox** -- List of bounding boxes to be fed to a model. + + - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or + if *"token_type_ids"* is in `self.model_input_names`). + + [What are token type IDs?](../glossary#token-type-ids) + + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`). + + [What are attention masks?](../glossary#attention-mask) + + - **labels** -- List of labels to be fed to a model. (when `word_labels` is specified). + - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and + `return_overflowing_tokens=True`). + - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and + `return_overflowing_tokens=True`). + - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying + regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`). + - **length** -- The length of the inputs (when `return_length=True`). +""" + + +@requires(backends=("sentencepiece",)) +class LayoutXLMTokenizer(PreTrainedTokenizer): + """ + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + cls_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`): + The bounding box to use for the special [CLS] token. + sep_token_box (`list[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`): + The bounding box to use for the special [SEP] token. + pad_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`): + The bounding box to use for the special [PAD] token. + pad_token_label (`int`, *optional*, defaults to -100): + The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's + CrossEntropyLoss. + only_label_first_subword (`bool`, *optional*, defaults to `True`): + Whether or not to only label the first subword, in case word labels are provided. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + cls_token_box=[0, 0, 0, 0], + sep_token_box=[1000, 1000, 1000, 1000], + pad_token_box=[0, 0, 0, 0], + pad_token_label=-100, + only_label_first_subword=True, + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file + + # Original fairseq vocab and spm vocab must be "aligned": + # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 + # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- + # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' + # spm | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a' + + # Mimic fairseq token-to-id alignment for the first 4 token + self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} + + # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab + self.fairseq_offset = 1 + + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + self.fairseq_offset + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + + # additional properties + self.cls_token_box = cls_token_box + self.sep_token_box = sep_token_box + self.pad_token_box = pad_token_box + self.pad_token_label = pad_token_label + self.only_label_first_subword = only_label_first_subword + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + cls_token_box=cls_token_box, + sep_token_box=sep_token_box, + pad_token_box=pad_token_box, + pad_token_label=pad_token_label, + only_label_first_subword=only_label_first_subword, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__.update(d) + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An XLM-RoBERTa sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does + not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len(self.sp_model) + self.fairseq_offset + 1 # Add the token + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> list[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + # Need to return unknown token if the SP model returned 0 + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING) + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], + text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None, + boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None, + word_labels: Optional[Union[list[int], list[list[int]]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of + sequences with word-level normalized bounding boxes and optional labels. + + Args: + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings + (words of a single example or questions of a batch of examples) or a list of list of strings (batch of + words). + text_pair (`list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence should be a list of strings + (pretokenized string). + boxes (`list[list[int]]`, `list[list[list[int]]]`): + Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale. + word_labels (`list[int]`, `list[list[int]]`, *optional*): + Word-level integer labels (for token classification tasks such as FUNSD, CORD). + """ + + # Input type checking for clearer error + def _is_valid_text_input(t): + if isinstance(t, str): + # Strings are fine + return True + elif isinstance(t, (list, tuple)): + # List are fine as long as they are... + if len(t) == 0: + # ... empty + return True + elif isinstance(t[0], str): + # ... list of strings + return True + elif isinstance(t[0], (list, tuple)): + # ... list with an empty list or with a list of strings + return len(t[0]) == 0 or isinstance(t[0][0], str) + else: + return False + else: + return False + + if text_pair is not None: + # in case text + text_pair are provided, text = questions, text_pair = words + if not _is_valid_text_input(text): + raise ValueError("text input must of type `str` (single example) or `list[str]` (batch of examples). ") + if not isinstance(text_pair, (list, tuple)): + raise ValueError( + "words must of type `list[str]` (single pretokenized example), " + "or `list[list[str]]` (batch of pretokenized examples)." + ) + else: + # in case only text is provided => must be words + if not isinstance(text, (list, tuple)): + raise ValueError( + "Words must of type `list[str]` (single pretokenized example), " + "or `list[list[str]]` (batch of pretokenized examples)." + ) + + if text_pair is not None: + is_batched = isinstance(text, (list, tuple)) + else: + is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)) + + words = text if text_pair is None else text_pair + if boxes is None: + raise ValueError("You must provide corresponding bounding boxes") + if is_batched: + if len(words) != len(boxes): + raise ValueError("You must provide words and boxes for an equal amount of examples") + for words_example, boxes_example in zip(words, boxes): + if len(words_example) != len(boxes_example): + raise ValueError("You must provide as many words as there are bounding boxes") + else: + if len(words) != len(boxes): + raise ValueError("You must provide as many words as there are bounding boxes") + + if is_batched: + if text_pair is not None and len(text) != len(text_pair): + raise ValueError( + f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:" + f" {len(text_pair)}." + ) + batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text + is_pair = bool(text_pair is not None) + return self.batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + is_pair=is_pair, + boxes=boxes, + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.encode_plus( + text=text, + text_pair=text_pair, + boxes=boxes, + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + list[TextInput], + list[TextInputPair], + list[PreTokenizedInput], + ], + is_pair: Optional[bool] = None, + boxes: Optional[list[list[list[int]]]] = None, + word_labels: Optional[list[list[int]]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." + ) + + batch_outputs = self._batch_prepare_for_model( + batch_text_or_text_pairs=batch_text_or_text_pairs, + is_pair=is_pair, + boxes=boxes, + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=return_tensors, + verbose=verbose, + ) + + return BatchEncoding(batch_outputs) + + @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING) + def _batch_prepare_for_model( + self, + batch_text_or_text_pairs, + is_pair: Optional[bool] = None, + boxes: Optional[list[list[int]]] = None, + word_labels: Optional[list[list[int]]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_length: bool = False, + verbose: bool = True, + ) -> BatchEncoding: + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It + adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens + + Args: + batch_ids_pairs: list of tokenized input ids or input ids pairs + """ + + batch_outputs = {} + for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)): + batch_text_or_text_pair, boxes_example = example + outputs = self.prepare_for_model( + batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair, + batch_text_or_text_pair[1] if is_pair else None, + boxes_example, + word_labels=word_labels[idx] if word_labels is not None else None, + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward + return_attention_mask=False, # we pad in batch afterward + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + prepend_batch_axis=False, + verbose=verbose, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_attention_mask=return_attention_mask, + ) + + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + + return batch_outputs + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput], + text_pair: Optional[PreTokenizedInput] = None, + boxes: Optional[list[list[int]]] = None, + word_labels: Optional[list[int]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast. " + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + return self.prepare_for_model( + text=text, + text_pair=text_pair, + boxes=boxes, + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING) + def prepare_for_model( + self, + text: Union[TextInput, PreTokenizedInput], + text_pair: Optional[PreTokenizedInput] = None, + boxes: Optional[list[list[int]]] = None, + word_labels: Optional[list[int]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + prepend_batch_axis: bool = False, + **kwargs, + ) -> BatchEncoding: + """ + Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens, + truncates sequences if overflowing while taking into account the special tokens and manages a moving window + (with user defined stride) for overflowing tokens. + + Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are turned into + token-level `labels`. The word label is used for the first token of the word, while remaining tokens are + labeled with -100, such that they will be ignored by the loss function. + + Args: + text (`str`, `list[str]`, `list[list[str]]`): + The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings. + text_pair (`list[str]` or `list[int]`, *optional*): + Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a + list of list of strings (words of a batch of examples). + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + tokens = [] + pair_tokens = [] + token_boxes = [] + pair_token_boxes = [] + labels = [] + + if text_pair is None: + if word_labels is None: + # CASE 1: document image classification (training + inference) + CASE 2: token classification (inference) + for word, box in zip(text, boxes): + if len(word) < 1: # skip empty words + continue + word_tokens = self.tokenize(word) + tokens.extend(word_tokens) + token_boxes.extend([box] * len(word_tokens)) + else: + # CASE 2: token classification (training) + for word, box, label in zip(text, boxes, word_labels): + if len(word) < 1: # skip empty words + continue + word_tokens = self.tokenize(word) + tokens.extend(word_tokens) + token_boxes.extend([box] * len(word_tokens)) + if self.only_label_first_subword: + # Use the real label id for the first token of the word, and padding ids for the remaining tokens + labels.extend([label] + [self.pad_token_label] * (len(word_tokens) - 1)) + else: + labels.extend([label] * len(word_tokens)) + else: + # CASE 3: document visual question answering (inference) + # text = question + # text_pair = words + tokens = self.tokenize(text) + token_boxes = [self.pad_token_box for _ in range(len(tokens))] + [self.sep_token_box] + + for word, box in zip(text_pair, boxes): + if len(word) < 1: # skip empty words + continue + word_tokens = self.tokenize(word) + pair_tokens.extend(word_tokens) + pair_token_boxes.extend([box] * len(word_tokens)) + + # Create ids + pair_ids + ids = self.convert_tokens_to_ids(tokens) + pair_ids = self.convert_tokens_to_ids(pair_tokens) if pair_tokens else None + + # Compute the total size of the returned encodings + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) + + # Truncation: Handle max sequence length + overflowing_tokens = [] + overflowing_token_boxes = [] + overflowing_labels = [] + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: + ( + ids, + token_boxes, + pair_ids, + pair_token_boxes, + labels, + overflowing_tokens, + overflowing_token_boxes, + overflowing_labels, + ) = self.truncate_sequences( + ids, + token_boxes, + pair_ids=pair_ids, + pair_token_boxes=pair_token_boxes, + labels=labels, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + + if return_token_type_ids and not add_special_tokens: + raise ValueError( + "Asking to return token_type_ids while setting add_special_tokens to False " + "results in an undefined behavior. Please set add_special_tokens to True or " + "set return_token_type_ids to None." + ) + + # Load from model defaults + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + encoded_inputs = {} + + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["overflowing_token_boxes"] = overflowing_token_boxes + encoded_inputs["overflowing_labels"] = overflowing_labels + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + # Add special tokens + if add_special_tokens: + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + token_boxes = [self.cls_token_box] + token_boxes + [self.sep_token_box] + if pair_token_boxes: + pair_token_boxes = pair_token_boxes + [self.sep_token_box] + if labels: + labels = [self.pad_token_label] + labels + [self.pad_token_label] + else: + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else []) + + # Build output dictionary + encoded_inputs["input_ids"] = sequence + encoded_inputs["bbox"] = token_boxes + pair_token_boxes + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(sequence) + + if labels: + encoded_inputs["labels"] = labels + + # Check lengths + self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose) + + # Padding + if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: + encoded_inputs = self.pad( + encoded_inputs, + max_length=max_length, + padding=padding_strategy.value, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_attention_mask=return_attention_mask, + ) + + if return_length: + encoded_inputs["length"] = len(encoded_inputs["input_ids"]) + + batch_outputs = BatchEncoding( + encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis + ) + + return batch_outputs + + def truncate_sequences( + self, + ids: list[int], + token_boxes: list[list[int]], + pair_ids: Optional[list[int]] = None, + pair_token_boxes: Optional[list[list[int]]] = None, + labels: Optional[list[int]] = None, + num_tokens_to_remove: int = 0, + truncation_strategy: Union[str, TruncationStrategy] = "longest_first", + stride: int = 0, + ) -> tuple[list[int], list[int], list[int]]: + """ + Truncates a sequence pair in-place following the strategy. + + Args: + ids (`list[int]`): + Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and + `convert_tokens_to_ids` methods. + token_boxes (`list[list[int]]`): + Bounding boxes of the first sequence. + pair_ids (`list[int]`, *optional*): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` + and `convert_tokens_to_ids` methods. + pair_token_boxes (`list[list[int]]`, *optional*): + Bounding boxes of the second sequence. + labels (`list[int]`, *optional*): + Labels of the first sequence (for token classification tasks). + num_tokens_to_remove (`int`, *optional*, defaults to 0): + Number of tokens to remove using the truncation strategy. + truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): + The strategy to follow for truncation. Can be: + + - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will truncate + token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a + batch of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater + than the model maximum admissible input size). + stride (`int`, *optional*, defaults to 0): + If set to a positive number, the overflowing tokens returned will contain some tokens from the main + sequence returned. The value of this argument defines the number of additional tokens. + + Returns: + `tuple[list[int], list[int], list[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of + overflowing tokens. + """ + if num_tokens_to_remove <= 0: + return ids, token_boxes, pair_ids, pair_token_boxes, labels, [], [], [] + + if not isinstance(truncation_strategy, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation_strategy) + + overflowing_tokens = [] + overflowing_token_boxes = [] + overflowing_labels = [] + if truncation_strategy == TruncationStrategy.LONGEST_FIRST: + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + if not overflowing_tokens: + window_len = min(len(ids), stride + 1) + else: + window_len = 1 + overflowing_tokens.extend(ids[-window_len:]) + overflowing_token_boxes.extend(token_boxes[-window_len:]) + overflowing_labels.extend(labels[-window_len:]) + ids = ids[:-1] + token_boxes = token_boxes[:-1] + labels = labels[:-1] + else: + if not overflowing_tokens: + window_len = min(len(pair_ids), stride + 1) + else: + window_len = 1 + overflowing_tokens.extend(pair_ids[-window_len:]) + overflowing_token_boxes.extend(pair_token_boxes[-window_len:]) + pair_ids = pair_ids[:-1] + pair_token_boxes = pair_token_boxes[:-1] + elif truncation_strategy == TruncationStrategy.ONLY_FIRST: + if len(ids) > num_tokens_to_remove: + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + overflowing_token_boxes = token_boxes[-window_len:] + overflowing_labels = labels[-window_len:] + ids = ids[:-num_tokens_to_remove] + token_boxes = token_boxes[:-num_tokens_to_remove] + labels = labels[:-num_tokens_to_remove] + else: + logger.error( + f"We need to remove {num_tokens_to_remove} to truncate the input " + f"but the first sequence has a length {len(ids)}. " + f"Please select another truncation strategy than {truncation_strategy}, " + "for instance 'longest_first' or 'only_second'." + ) + elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: + if len(pair_ids) > num_tokens_to_remove: + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + overflowing_token_boxes = pair_token_boxes[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + pair_token_boxes = pair_token_boxes[:-num_tokens_to_remove] + else: + logger.error( + f"We need to remove {num_tokens_to_remove} to truncate the input " + f"but the second sequence has a length {len(pair_ids)}. " + f"Please select another truncation strategy than {truncation_strategy}, " + "for instance 'longest_first' or 'only_first'." + ) + + return ( + ids, + token_boxes, + pair_ids, + pair_token_boxes, + labels, + overflowing_tokens, + overflowing_token_boxes, + overflowing_labels, + ) + + def _pad( + self, + encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + required_input = encoded_inputs[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if return_attention_mask and "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * len(required_input) + + if needs_to_be_padded: + difference = max_length - len(required_input) + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if "bbox" in encoded_inputs: + encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference + if "labels" in encoded_inputs: + encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference + elif padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if "bbox" in encoded_inputs: + encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"] + if "labels" in encoded_inputs: + encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"] + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + else: + raise ValueError("Invalid padding strategy:" + str(padding_side)) + + return encoded_inputs + + +__all__ = ["LayoutXLMTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..6710c6c8cb66ed08da2df391c28ad1db2e6cf81d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py @@ -0,0 +1,815 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for LayoutXLM model.""" + +import os +from shutil import copyfile +from typing import Optional, Union + +from ...tokenization_utils import AddedToken +from ...tokenization_utils_base import ( + BatchEncoding, + EncodedInput, + PreTokenizedInput, + TextInput, + TextInputPair, + TruncationStrategy, +) +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available, logging +from ..xlm_roberta.tokenization_xlm_roberta_fast import ( + VOCAB_FILES_NAMES, +) + + +if is_sentencepiece_available(): + from .tokenization_layoutxlm import LayoutXLMTokenizer +else: + LayoutXLMTokenizer = None + + +logger = logging.get_logger(__name__) + +LAYOUTXLM_ENCODE_KWARGS_DOCSTRING = r""" + add_special_tokens (`bool`, *optional*, defaults to `True`): + Whether or not to encode the sequences with the special tokens relative to their model. + padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): + Activates and controls truncation. Accepts the following values: + + - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will + truncate token by token, removing a token from the longest sequence in the pair if a pair of + sequences (or a batch of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. + + If left unset or set to `None`, this will use the predefined model maximum length if a maximum length + is required by one of the truncation/padding parameters. If the model has no specific maximum input + length (like XLNet) truncation/padding to a maximum length will be deactivated. + stride (`int`, *optional*, defaults to 0): + If set to a number along with `max_length`, the overflowing tokens returned when + `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence + returned to provide some overlap between truncated and overflowing sequences. The value of this + argument defines the number of overlapping tokens. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. This is especially useful to enable + the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + return_tensors (`str` or [`~file_utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + return_token_type_ids (`bool`, *optional*): + Whether to return token type IDs. If left to the default, will return the token type IDs according to + the specific tokenizer's default, defined by the `return_outputs` attribute. + + [What are token type IDs?](../glossary#token-type-ids) + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the `return_outputs` attribute. + + [What are attention masks?](../glossary#attention-mask) + return_overflowing_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch + of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead + of returning overflowing tokens. + return_special_tokens_mask (`bool`, *optional*, defaults to `False`): + Whether or not to return special tokens mask information. + return_offsets_mapping (`bool`, *optional*, defaults to `False`): + Whether or not to return `(char_start, char_end)` for each token. + + This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using + Python's tokenizer, this method will raise `NotImplementedError`. + return_length (`bool`, *optional*, defaults to `False`): + Whether or not to return the lengths of the encoded inputs. + verbose (`bool`, *optional*, defaults to `True`): + Whether or not to print more information and warnings. + **kwargs: passed to the `self.tokenize()` method + + Return: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + + [What are input IDs?](../glossary#input-ids) + + - **bbox** -- List of bounding boxes to be fed to a model. + + - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or + if *"token_type_ids"* is in `self.model_input_names`). + + [What are token type IDs?](../glossary#token-type-ids) + + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`). + + [What are attention masks?](../glossary#attention-mask) + + - **labels** -- List of labels to be fed to a model. (when `word_labels` is specified). + - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and + `return_overflowing_tokens=True`). + - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and + `return_overflowing_tokens=True`). + - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying + regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`). + - **length** -- The length of the inputs (when `return_length=True`). +""" + + +class LayoutXLMTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" LayoutXLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from + [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models). + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + cls_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`): + The bounding box to use for the special [CLS] token. + sep_token_box (`list[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`): + The bounding box to use for the special [SEP] token. + pad_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`): + The bounding box to use for the special [PAD] token. + pad_token_label (`int`, *optional*, defaults to -100): + The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's + CrossEntropyLoss. + only_label_first_subword (`bool`, *optional*, defaults to `True`): + Whether or not to only label the first subword, in case word labels are provided. + additional_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = LayoutXLMTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + cls_token_box=[0, 0, 0, 0], + sep_token_box=[1000, 1000, 1000, 1000], + pad_token_box=[0, 0, 0, 0], + pad_token_label=-100, + only_label_first_subword=True, + **kwargs, + ): + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + cls_token_box=cls_token_box, + sep_token_box=sep_token_box, + pad_token_box=pad_token_box, + pad_token_label=pad_token_label, + only_label_first_subword=only_label_first_subword, + **kwargs, + ) + + self.vocab_file = vocab_file + + # additional properties + self.cls_token_box = cls_token_box + self.sep_token_box = sep_token_box + self.pad_token_box = pad_token_box + self.pad_token_label = pad_token_label + self.only_label_first_subword = only_label_first_subword + + @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING) + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], + text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None, + boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None, + word_labels: Optional[Union[list[int], list[list[int]]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of + sequences with word-level normalized bounding boxes and optional labels. + + Args: + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings + (words of a single example or questions of a batch of examples) or a list of list of strings (batch of + words). + text_pair (`list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence should be a list of strings + (pretokenized string). + boxes (`list[list[int]]`, `list[list[list[int]]]`): + Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale. + word_labels (`list[int]`, `list[list[int]]`, *optional*): + Word-level integer labels (for token classification tasks such as FUNSD, CORD). + """ + + # Input type checking for clearer error + def _is_valid_text_input(t): + if isinstance(t, str): + # Strings are fine + return True + elif isinstance(t, (list, tuple)): + # List are fine as long as they are... + if len(t) == 0: + # ... empty + return True + elif isinstance(t[0], str): + # ... list of strings + return True + elif isinstance(t[0], (list, tuple)): + # ... list with an empty list or with a list of strings + return len(t[0]) == 0 or isinstance(t[0][0], str) + else: + return False + else: + return False + + if text_pair is not None: + # in case text + text_pair are provided, text = questions, text_pair = words + if not _is_valid_text_input(text): + raise ValueError("text input must of type `str` (single example) or `list[str]` (batch of examples). ") + if not isinstance(text_pair, (list, tuple)): + raise ValueError( + "words must of type `list[str]` (single pretokenized example), " + "or `list[list[str]]` (batch of pretokenized examples)." + ) + else: + # in case only text is provided => must be words + if not isinstance(text, (list, tuple)): + raise ValueError( + "Words must of type `list[str]` (single pretokenized example), " + "or `list[list[str]]` (batch of pretokenized examples)." + ) + + if text_pair is not None: + is_batched = isinstance(text, (list, tuple)) + else: + is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)) + + words = text if text_pair is None else text_pair + if boxes is None: + raise ValueError("You must provide corresponding bounding boxes") + if is_batched: + if len(words) != len(boxes): + raise ValueError("You must provide words and boxes for an equal amount of examples") + for words_example, boxes_example in zip(words, boxes): + if len(words_example) != len(boxes_example): + raise ValueError("You must provide as many words as there are bounding boxes") + else: + if len(words) != len(boxes): + raise ValueError("You must provide as many words as there are bounding boxes") + + if is_batched: + if text_pair is not None and len(text) != len(text_pair): + raise ValueError( + f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:" + f" {len(text_pair)}." + ) + batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text + is_pair = bool(text_pair is not None) + return self.batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + is_pair=is_pair, + boxes=boxes, + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.encode_plus( + text=text, + text_pair=text_pair, + boxes=boxes, + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]: + batched_input = [(text, pair)] if pair else [text] + + self._tokenizer.encode_special_tokens = kwargs.pop( + "split_special_tokens", self._tokenizer.encode_special_tokens + ) + + encodings = self._tokenizer.encode_batch( + batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs + ) + + return encodings[0].tokens + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + list[TextInput], + list[TextInputPair], + list[PreTokenizedInput], + ], + is_pair: Optional[bool] = None, + boxes: Optional[list[list[list[int]]]] = None, + word_labels: Optional[list[list[int]]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + if not isinstance(batch_text_or_text_pairs, list): + raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})") + + # Set the truncation and padding strategy and restore the initial configuration + self.set_truncation_and_padding( + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + ) + + if is_pair: + batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs] + + encodings = self._tokenizer.encode_batch( + batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + is_pretokenized=True, # we set this to True as LayoutLMv2 always expects pretokenized inputs + ) + + # Convert encoding to dict + # `Tokens` has type: tuple[ + # list[dict[str, list[list[int]]]] or list[dict[str, 2D-Tensor]], + # list[EncodingFast] + # ] + # with nested dimensions corresponding to batch, overflows, sequence length + tokens_and_encodings = [ + self._convert_encoding( + encoding=encoding, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=True + if word_labels is not None + else return_offsets_mapping, # we use offsets to create the labels + return_length=return_length, + verbose=verbose, + ) + for encoding in encodings + ] + + # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension + # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length) + # (we say ~ because the number of overflow varies with the example in the batch) + # + # To match each overflowing sample with the original sample in the batch + # we add an overflow_to_sample_mapping array (see below) + sanitized_tokens = {} + for key in tokens_and_encodings[0][0]: + stack = [e for item, _ in tokens_and_encodings for e in item[key]] + sanitized_tokens[key] = stack + sanitized_encodings = [e for _, item in tokens_and_encodings for e in item] + + # If returning overflowing tokens, we need to return a mapping + # from the batch idx to the original sample + if return_overflowing_tokens: + overflow_to_sample_mapping = [] + for i, (toks, _) in enumerate(tokens_and_encodings): + overflow_to_sample_mapping += [i] * len(toks["input_ids"]) + sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping + + for input_ids in sanitized_tokens["input_ids"]: + self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose) + + # create the token boxes + token_boxes = [] + for batch_index in range(len(sanitized_tokens["input_ids"])): + if return_overflowing_tokens: + original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index] + else: + original_index = batch_index + token_boxes_example = [] + for id, sequence_id, word_id in zip( + sanitized_tokens["input_ids"][batch_index], + sanitized_encodings[batch_index].sequence_ids, + sanitized_encodings[batch_index].word_ids, + ): + if word_id is not None: + if is_pair and sequence_id == 0: + token_boxes_example.append(self.pad_token_box) + else: + token_boxes_example.append(boxes[original_index][word_id]) + else: + if id == self.cls_token_id: + token_boxes_example.append(self.cls_token_box) + elif id == self.sep_token_id: + token_boxes_example.append(self.sep_token_box) + elif id == self.pad_token_id: + token_boxes_example.append(self.pad_token_box) + else: + raise ValueError("Id not recognized") + token_boxes.append(token_boxes_example) + + sanitized_tokens["bbox"] = token_boxes + + # optionally, create the labels + if word_labels is not None: + labels = [] + for batch_index in range(len(sanitized_tokens["input_ids"])): + if return_overflowing_tokens: + original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index] + else: + original_index = batch_index + labels_example = [] + for id, offset, word_id in zip( + sanitized_tokens["input_ids"][batch_index], + sanitized_tokens["offset_mapping"][batch_index], + sanitized_encodings[batch_index].word_ids, + ): + if word_id is not None: + if self.only_label_first_subword: + if offset[0] == 0: + # Use the real label id for the first token of the word, and padding ids for the remaining tokens + labels_example.append(word_labels[original_index][word_id]) + else: + labels_example.append(self.pad_token_label) + else: + labels_example.append(word_labels[original_index][word_id]) + else: + labels_example.append(self.pad_token_label) + labels.append(labels_example) + + sanitized_tokens["labels"] = labels + # finally, remove offsets if the user didn't want them + if not return_offsets_mapping: + del sanitized_tokens["offset_mapping"] + + return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors) + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput], + text_pair: Optional[PreTokenizedInput] = None, + boxes: Optional[list[list[int]]] = None, + word_labels: Optional[list[int]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[bool] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + # make it a batched input + # 2 options: + # 1) only text, in case text must be a list of str + # 2) text + text_pair, in which case text = str and text_pair a list of str + batched_input = [(text, text_pair)] if text_pair else [text] + batched_boxes = [boxes] + batched_word_labels = [word_labels] if word_labels is not None else None + batched_output = self._batch_encode_plus( + batched_input, + is_pair=bool(text_pair is not None), + boxes=batched_boxes, + word_labels=batched_word_labels, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + # Return tensor is None, then we can remove the leading batch axis + # Overflowing tokens are returned as a batch of output so we keep them in this case + if return_tensors is None and not return_overflowing_tokens: + batched_output = BatchEncoding( + { + key: value[0] if len(value) > 0 and isinstance(value[0], list) else value + for key, value in batched_output.items() + }, + batched_output.encodings, + ) + + self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose) + + return batched_output + + def _pad( + self, + encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + required_input = encoded_inputs[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if return_attention_mask and "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * len(required_input) + + if needs_to_be_padded: + difference = max_length - len(required_input) + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if "bbox" in encoded_inputs: + encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference + if "labels" in encoded_inputs: + encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference + elif padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if "bbox" in encoded_inputs: + encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"] + if "labels" in encoded_inputs: + encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"] + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + else: + raise ValueError("Invalid padding strategy:" + str(padding_side)) + + return encoded_inputs + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An XLM-RoBERTa sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does + not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory.") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["LayoutXLMTokenizerFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13bef0d263b1e570728f7593eb606ec4491a2310 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/configuration_led.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/configuration_led.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe54f28b76db1f8889c771322e20c5eb94e2e8be Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/configuration_led.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/tokenization_led.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/tokenization_led.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fb863f97d75af5cc8df546f2636b74716ffcf7b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/tokenization_led.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/tokenization_led_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/tokenization_led_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a9dd80ff334ead19a4de4f2c946098e5756d7d6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/tokenization_led_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/image_processing_llama4_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/image_processing_llama4_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..543f9f65d03d97995b9dd1d9c8e00ce265714e47 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/image_processing_llama4_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aadd45dc13ed4074437cc6f224b0348de110f292 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_llava import * + from .image_processing_llava_fast import * + from .modeling_llava import * + from .processing_llava import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e960aeeb0023de5ae54bf6107ed10f87eb01dbbd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/configuration_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/configuration_llava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa366fd24ecde516610c83eead5646da88b3c7d1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/configuration_llava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/image_processing_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/image_processing_llava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..406e8fae9dee06d986176d158742cd72e7b9ea3c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/image_processing_llava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/image_processing_llava_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/image_processing_llava_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4606589f99a2bd97a42982e2a8251fc222213aea Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/image_processing_llava_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/modeling_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/modeling_llava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e27e0d3caa375ac8c2605aab367328280fe42e8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/modeling_llava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/processing_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/processing_llava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2579fd4956d0f6f5b92399b30eb38365e4319c1a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/processing_llava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/configuration_llava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/configuration_llava.py new file mode 100644 index 0000000000000000000000000000000000000000..9ae710c011986f422d965adcdb758b29e3113690 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/configuration_llava.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2023 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Llava model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class LlavaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LlavaForConditionalGeneration`]. It is used to instantiate an + Llava model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Llava-9B. + + e.g. [llava-hf/llava-9b](https://huggingface.co/llava-hf/llava-9b) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + image_token_index (`int`, *optional*, defaults to 32000): + The image token index to encode the image prompt. + projector_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The activation function used by the multimodal projector. + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. + vision_feature_layer (`Union[int, list[int]]`, *optional*, defaults to -2): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + image_seq_length (`int`, *optional*, defaults to 576): + Sequence length of one image embedding. + multimodal_projector_bias (`bool`, *optional*, defaults to `True`): + Whether to use bias in the multimodal projector. + + Example: + + ```python + >>> from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, LlamaConfig + + >>> # Initializing a CLIP-vision config + >>> vision_config = CLIPVisionConfig() + + >>> # Initializing a Llama config + >>> text_config = LlamaConfig() + + >>> # Initializing a Llava llava-1.5-7b style configuration + >>> configuration = LlavaConfig(vision_config, text_config) + + >>> # Initializing a model from the llava-1.5-7b style configuration + >>> model = LlavaForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "llava" + attribute_map = { + "image_token_id": "image_token_index", + } + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + image_token_index=32000, + projector_hidden_act="gelu", + vision_feature_select_strategy="default", + vision_feature_layer=-2, + image_seq_length=576, + multimodal_projector_bias=True, + **kwargs, + ): + self.image_token_index = image_token_index + self.projector_hidden_act = projector_hidden_act + self.image_seq_length = image_seq_length + + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError( + "vision_feature_select_strategy should be one of 'default', 'full'." + f"Got: {vision_feature_select_strategy}" + ) + + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "clip_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["clip_vision_model"]( + intermediate_size=4096, + hidden_size=1024, + patch_size=14, + image_size=336, + num_hidden_layers=24, + num_attention_heads=16, + vocab_size=32000, + projection_dim=768, + ) + + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "llama") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["llama"]() + + self.text_config = text_config + self.multimodal_projector_bias = multimodal_projector_bias + + super().__init__(**kwargs) + + +__all__ = ["LlavaConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/image_processing_llava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/image_processing_llava.py new file mode 100644 index 0000000000000000000000000000000000000000..5420d6fe291861951c7e6b5bfc555b81c57cbc90 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/image_processing_llava.py @@ -0,0 +1,437 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for LLaVa.""" + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + convert_to_rgb, + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_kwargs, + validate_preprocess_arguments, +) +from ...utils import TensorType, is_vision_available, logging + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +class LlavaImageProcessor(BaseImageProcessor): + r""" + Constructs a LLaVa image processor. + + Args: + do_pad (`bool`, *optional*, defaults to `False`): + Whether to pad the image to a square based on the longest edge. + The padding value is determined by the `image_mean` parameter. + Can be overridden by `do_pad` in the `preprocess` method. + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_pad: bool = False, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Optional[dict[str, int]] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_pad = do_pad + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + self._valid_processor_keys = [ + "images", + "do_pad", + "do_resize", + "size", + "resample", + "do_center_crop", + "crop_size", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_convert_rgb", + "return_tensors", + "data_format", + "input_data_format", + ] + + def pad_to_square( + self, + image: np.ndarray, + background_color: Union[int, tuple[int, int, int]] = 0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pads an image to a square based on the longest edge. + + Args: + image (`np.ndarray`): + The image to pad. + background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0): + The color to use for the padding. Can be an integer for single channel or a + tuple of integers representing for multi-channel images. If passed as integer + in multi-channel mode, it will default to `0` in subsequent channels. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. + """ + height, width = get_image_size(image, input_data_format) + num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1] + + if height == width: + image = ( + to_channel_dimension_format(image, data_format, input_data_format) + if data_format is not None + else image + ) + return image + + max_dim = max(height, width) + + # Ensure background_color is the correct shape + if isinstance(background_color, int): + background_color = [background_color] + elif len(background_color) != num_channels: + raise ValueError( + f"background_color must have no more than {num_channels} elements to match the number of channels" + ) + + if input_data_format == ChannelDimension.FIRST: + result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype) + for i, color in enumerate(background_color): + result[i, :, :] = color + if width > height: + start = (max_dim - height) // 2 + result[:, start : start + height, :] = image + else: + start = (max_dim - width) // 2 + result[:, :, start : start + width] = image + else: + result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype) + for i, color in enumerate(background_color): + result[:, :, i] = color + if width > height: + start = (max_dim - height) // 2 + result[start : start + height, :, :] = image + else: + start = (max_dim - width) // 2 + result[:, start : start + width, :] = image + + image = ( + to_channel_dimension_format(result, data_format, input_data_format) if data_format is not None else result + ) + return image + + # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + default_to_square = True + if "shortest_edge" in size: + size = size["shortest_edge"] + default_to_square = False + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.") + + output_size = get_resize_output_image_size( + image, + size=size, + default_to_square=default_to_square, + input_data_format=input_data_format, + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def preprocess( + self, + images: ImageInput, + do_pad: Optional[bool] = None, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image to a square based on the longest edge. + The padding value is determined by the `image_mean` parameter. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_pad = do_pad if do_pad is not None else self.do_pad + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + # we don't pass `do_pad` here since LLaVa uses a custom padding to a square + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + processed_images = [] + for image in images: + if do_pad: + image = self.pad_to_square( + image=image, + background_color=tuple(int(x * 255) for x in self.image_mean), + input_data_format=input_data_format, + ) + + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + processed_images.append(image) + + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + + +__all__ = ["LlavaImageProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/image_processing_llava_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/image_processing_llava_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..5960700405494208325f970ee3a1495cfddc6a87 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/image_processing_llava_fast.py @@ -0,0 +1,171 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for LLaVa.""" + +from typing import Optional, Union + +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import BatchFeature +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + SizeDict, + get_image_size, +) +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + auto_docstring, +) + + +class LlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): ... + + +@auto_docstring +class LlavaImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"shortest_edge": 224} + default_to_square = False + crop_size = {"height": 224, "width": 224} + do_pad = False + do_resize = True + do_center_crop = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + valid_kwargs = LlavaFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> None: + super().__init__(**kwargs) + + @auto_docstring + def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + + def pad_to_square( + self, + images: "torch.Tensor", + background_color: Union[int, tuple[int, int, int]] = 0, + ) -> "torch.Tensor": + """ + Pads an image to a square based on the longest edge. + + Args: + images (`np.ndarray`): + The images to pad. + background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0): + The color to use for the padding. Can be an integer for single channel or a + tuple of integers representing for multi-channel images. If passed as integer + in multi-channel mode, it will default to `0` in subsequent channels. + Returns: + `torch.Tensor`: The padded images. + """ + height, width = get_image_size(images, ChannelDimension.FIRST) + + if height == width: + return images + + num_channels = images.shape[1] if len(images.shape) == 4 else images.shape[0] + if isinstance(background_color, int): + background_color = [background_color] + [0] * (num_channels - 1) + elif len(background_color) != num_channels: + raise ValueError( + f"background_color must have no more than {num_channels} elements to match the number of channels" + ) + + max_dim = max(height, width) + paste_x_left = (max_dim - width) // 2 + paste_y_left = (max_dim - height) // 2 + paste_x_right = max_dim - width - paste_x_left + paste_y_right = max_dim - height - paste_y_left + padded_images = F.pad( + images, padding=[paste_x_left, paste_y_left, paste_x_right, paste_y_right], fill=background_color + ) + + return padded_images + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_pad: bool, + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_pad: + stacked_images = self.pad_to_square( + images=stacked_images, background_color=tuple(int(x * 255) for x in self.image_mean) + ) + resized_images_grouped[shape] = stacked_images + padded_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for batched resizing + # Needed in case do_pad is False, or padding returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(padded_images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + + +__all__ = ["LlavaImageProcessorFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/modeling_llava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/modeling_llava.py new file mode 100644 index 0000000000000000000000000000000000000000..bc0bb0df7c7baa60aaee0f2fd585102368bea1db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/modeling_llava.py @@ -0,0 +1,484 @@ +# coding=utf-8 +# Copyright 2023 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Llava model.""" + +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ..auto import AutoModel +from .configuration_llava import LlavaConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Llava outputs, with hidden states and attentions. + """ +) +class LlavaModelOutputWithPast(BaseModelOutputWithPast): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Llava causal language model (or autoregressive) outputs. + """ +) +class LlavaCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +class LlavaMultiModalProjector(nn.Module): + def __init__(self, config: LlavaConfig): + super().__init__() + # We have hidden_size * the number of vision feature layers + num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer) + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * num_feature_layers, + config.text_config.hidden_size, + bias=config.multimodal_projector_bias, + ) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = nn.Linear( + config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias + ) + + def forward(self, image_features): + hidden_states = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +@auto_docstring +class LlavaPreTrainedModel(PreTrainedModel): + config: LlavaConfig + base_model_prefix = "" + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + + _supports_flash_attn = True + _supports_sdpa = True + + _can_compile_fullgraph = True + _supports_flex_attn = True + _supports_attention_backend = True + + +@auto_docstring( + custom_intro=""" + The Llava model which consists of a vision backbone and a language model, without a language modeling head. + """ +) +class LlavaModel(LlavaPreTrainedModel): + _checkpoint_conversion_mapping = {"language_model.model": "language_model"} + + def __init__(self, config: LlavaConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) + + self.multi_modal_projector = LlavaMultiModalProjector(config) + self.language_model = AutoModel.from_config(config.text_config) + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + **kwargs, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): + The tensors corresponding to the input images. + vision_feature_layer (`Union[int, list[int]]`, *optional*): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + vision_feature_select_strategy (`str`, *optional*): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + + kwargs = {k: v for k, v in kwargs.items() if v is not None} + # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True, **kwargs) + + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + else: + hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + # For default; crop CLS from each hidden state in the hidden state pool + if vision_feature_select_strategy == "default": + hs_pool = [hs[:, 1:] for hs in hs_pool] + selected_image_feature = torch.cat(hs_pool, dim=-1) + + image_features = self.multi_modal_projector(selected_image_feature) + + if "image_sizes" in kwargs: + split_sizes = [ + (height // self.vision_tower.patch_size) * (width // self.vision_tower.patch_size) + for height, width in kwargs["image_sizes"] + ] + image_features = torch.split(image_features.squeeze(0), split_sizes) + else: + image_features = list(image_features) + return image_features + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] * image_features.shape[1] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + cache_position: Optional[torch.LongTensor] = None, + image_sizes: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, LlavaModelOutputWithPast]: + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + image_sizes=image_sizes, + ) + image_features = torch.cat(image_features, dim=0).to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + return LlavaModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +@auto_docstring( + custom_intro=""" + The LLAVA model which consists of a vision backbone and a language model. + """ +) +class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = { + "^language_model.model": "model.language_model", + "^vision_tower": "model.vision_tower", + "^multi_modal_projector": "model.multi_modal_projector", + "^language_model.lm_head": "lm_head", + } + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: LlavaConfig): + super().__init__(config) + self.model = LlavaModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + **kwargs, + ): + return self.model.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + **kwargs, + ) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + return self.model.multi_modal_projector + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, LlavaCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, LlavaForConditionalGeneration + + >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf") + >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + + >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_new_tokens=15) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" + ```""" + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + cache_position=cache_position, + image_sizes=image_sizes, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return LlavaCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + +__all__ = ["LlavaForConditionalGeneration", "LlavaPreTrainedModel", "LlavaModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/processing_llava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/processing_llava.py new file mode 100644 index 0000000000000000000000000000000000000000..63c07c20cbb9974fb6d2597e88e5224cd0111a52 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/processing_llava.py @@ -0,0 +1,212 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Llava. +""" + +from typing import Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput, get_image_size, to_numpy_array +from ...processing_utils import ( + MultiModalData, + ProcessingKwargs, + ProcessorMixin, + Unpack, +) +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class LlavaProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": {"padding": False, "return_mm_token_type_ids": False}, + "images_kwargs": {}, + } + + +class LlavaProcessor(ProcessorMixin): + r""" + Constructs a LLaVa processor which wraps a LLaVa image processor and a LLaMa tokenizer into a single processor. + + [`LlavaProcessor`] offers all the functionalities of [`LlavaImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`~LlavaProcessor.__call__`] and [`~LlavaProcessor.decode`] for more information. + + Args: + image_processor ([`LlavaImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`], *optional*): + The tokenizer is a required input. + patch_size (`int`, *optional*): + Patch size from the vision tower. + vision_feature_select_strategy (`str`, *optional*): + The feature selection strategy used to select the vision feature from the vision backbone. + Should be same as in model's config + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + image_token (`str`, *optional*, defaults to `""`): + Special token used to denote image location. + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer=None, + patch_size=None, + vision_feature_select_strategy=None, + chat_template=None, + image_token="", # set the default and let users change if they have peculiar special tokens in rare cases + num_additional_image_tokens=0, + **kwargs, + ): + self.patch_size = patch_size + self.num_additional_image_tokens = num_additional_image_tokens + self.vision_feature_select_strategy = vision_feature_select_strategy + self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token + self.image_token_id = tokenizer.encode(self.image_token, add_special_tokens=False)[0] + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[LlavaProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + if images is None and text is None: + raise ValueError("You have to specify at least one of `images` or `text`.") + + output_kwargs = self._merge_kwargs( + LlavaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + else: + image_inputs = {} + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise TypeError("Invalid input text. Please provide a string, or a list of strings") + + # try to expand inputs in processing if we have the necessary parts + prompt_strings = text + if image_inputs.get("pixel_values") is not None: + # Replace the image token with the expanded image token sequence + pixel_values = image_inputs["pixel_values"] + height, width = get_image_size(to_numpy_array(pixel_values[0])) + num_image_tokens = (height // self.patch_size) * ( + width // self.patch_size + ) + self.num_additional_image_tokens + if self.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + + prompt_strings = [] + for sample in text: + sample = sample.replace(self.image_token, self.image_token * num_image_tokens) + prompt_strings.append(sample) + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) + + if return_mm_token_type_ids: + array_ids = np.array(text_inputs["input_ids"]) + mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) + mm_token_type_ids[array_ids == self.image_token_id] = 1 + text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() + + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + + vision_data = {} + if image_sizes is not None: + images_kwargs = LlavaProcessorKwargs._defaults.get("images_kwargs", {}) + images_kwargs.update(kwargs) + crop_size = images_kwargs.get("crop_size", None) or self.image_processor.crop_size + resized_height, resized_width = crop_size["height"], crop_size["width"] + + num_image_tokens = (resized_height // self.patch_size) * (resized_width // self.patch_size) + num_image_tokens += self.num_additional_image_tokens + if self.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + + num_image_tokens = [num_image_tokens] * len(image_sizes) + num_image_patches = [1] * len(image_sizes) + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + + +__all__ = ["LlavaProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..700e26350df81d1d7a2163eef51e8b2da573a87a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/configuration_llava_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/configuration_llava_next.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbf7a4c64f112ee29bc62bd9da22659e9b847213 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/configuration_llava_next.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/image_processing_llava_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/image_processing_llava_next.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f17a7aba47c2f2076420246293d03d5293fad303 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/image_processing_llava_next.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/image_processing_llava_next_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/image_processing_llava_next_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d98becc15135d91b98a036a39392f5940a80a1f3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/image_processing_llava_next_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/modeling_llava_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/modeling_llava_next.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2728ca85c654ef7fd69d3908e95fe4837f24656 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/modeling_llava_next.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/processing_llava_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/processing_llava_next.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..349b5f6f0e5410c19a225a491e187febfd69fb50 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/processing_llava_next.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6684e9c33036c104dc5f1ec1ec0dc02d87c91719 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/configuration_longcat_flash.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/configuration_longcat_flash.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f12304a64dd0575dc10b3efb46d8b8eb7ad498da Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/configuration_longcat_flash.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/modeling_longcat_flash.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/modeling_longcat_flash.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e54390584863e036558bdfef33d6803216f3a221 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/modeling_longcat_flash.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/modular_longcat_flash.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/modular_longcat_flash.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e70290fef88869d172fdba02dcfecb9cfe5b69d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/modular_longcat_flash.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e3365cff6e6847a2b381019c098a48e5c1dd0a5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/__pycache__/configuration_longt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/__pycache__/configuration_longt5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af68d5ff07b3ed8bcf04a0e8e4e4680f7ae2b05f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/__pycache__/configuration_longt5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..623b56c71c35571e4634547534093921483d8f31 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/configuration_lxmert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/configuration_lxmert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f938a076d279831a363877b172fbe6fbe2117ca9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/configuration_lxmert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/modeling_lxmert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/modeling_lxmert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ea3a201affe0c1241fc638ba475544b69e37231 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/modeling_lxmert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/modeling_tf_lxmert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/modeling_tf_lxmert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b7effe1fbd5fe6afa973fa3ecb0ba77a41e855a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/modeling_tf_lxmert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..019dafc905e6b248b9ef9686c953838aaeb0e2f0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b71c0d72f1e493a0d7c42e9e5d8d6ed8dc309f9d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/markuplm/__pycache__/modeling_markuplm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/markuplm/__pycache__/modeling_markuplm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1cde448aebcdf2a991f0e65debf56e058f85df1a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/markuplm/__pycache__/modeling_markuplm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..536514d32ee704a198b8746bd8ab2b13d4c3df84 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/configuration_maskformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/configuration_maskformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b4d9863803c4e5773c683cccfaeb38c5ca4a873 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/configuration_maskformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/configuration_maskformer_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/configuration_maskformer_swin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb923c87938e81456d24bd63d39b03d918541368 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/configuration_maskformer_swin.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/feature_extraction_maskformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/feature_extraction_maskformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f85fa69a40410358b41d5bdd01f9a409e10dffca Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/feature_extraction_maskformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/image_processing_maskformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/image_processing_maskformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0586f6f33d29a462b32e74243448b4396889390c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/image_processing_maskformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/image_processing_maskformer_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/image_processing_maskformer_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eeddb15a71cb5dbcee7f6e1677124a274ae5497e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/image_processing_maskformer_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/modeling_maskformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/modeling_maskformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d45911043144c8840f32aaea487196f3d83bb0df Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/modeling_maskformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/modeling_maskformer_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/modeling_maskformer_swin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7368cd0b9fe05eaf1266a473d316d11ad85ceb68 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/modeling_maskformer_swin.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5797a1e89776a1de02ee857bc3577f5f0c4d99e5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/tokenization_mbart50.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/tokenization_mbart50.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7fad6458561921cde60468881f3e2298b6ae320 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/tokenization_mbart50.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/tokenization_mbart50_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/tokenization_mbart50_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e12700e5b32be209afd8ba9812084019bc6cccf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/tokenization_mbart50_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_gpt2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_gpt2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c65c038af4f46cfdb545d9e42015cccc8b198f94 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_gpt2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_gpt2/__pycache__/checkpoint_reshaping_and_interoperability.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_gpt2/__pycache__/checkpoint_reshaping_and_interoperability.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a84dc60a6bc88feb5777b8e202fd2946f556588c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_gpt2/__pycache__/checkpoint_reshaping_and_interoperability.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a20c4cd4efc3ee8da6b22d0d4edf3dab25dcfc06 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_mgp_str import * + from .modeling_mgp_str import * + from .processing_mgp_str import * + from .tokenization_mgp_str import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/configuration_mgp_str.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/configuration_mgp_str.py new file mode 100644 index 0000000000000000000000000000000000000000..1f6b355c7718054f2522a6892f5a3212c4d85ced --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/configuration_mgp_str.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MGP-STR model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class MgpstrConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`MgpstrModel`]. It is used to instantiate an + MGP-STR model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the MGP-STR + [alibaba-damo/mgp-str-base](https://huggingface.co/alibaba-damo/mgp-str-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + image_size (`list[int]`, *optional*, defaults to `[32, 128]`): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 4): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + max_token_length (`int`, *optional*, defaults to 27): + The max number of output tokens. + num_character_labels (`int`, *optional*, defaults to 38): + The number of classes for character head . + num_bpe_labels (`int`, *optional*, defaults to 50257): + The number of classes for bpe head . + num_wordpiece_labels (`int`, *optional*, defaults to 30522): + The number of classes for wordpiece head . + hidden_size (`int`, *optional*, defaults to 768): + The embedding dimension. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`float`, *optional*, defaults to 4.0): + The ratio of mlp hidden dim to embedding dim. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + distilled (`bool`, *optional*, defaults to `False`): + Model includes a distillation token and head as in DeiT models. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + drop_rate (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder. + attn_drop_rate (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The stochastic depth rate. + output_a3_attentions (`bool`, *optional*, defaults to `False`): + Whether or not the model should returns A^3 module attentions. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + + Example: + + ```python + >>> from transformers import MgpstrConfig, MgpstrForSceneTextRecognition + + >>> # Initializing a Mgpstr mgp-str-base style configuration + >>> configuration = MgpstrConfig() + + >>> # Initializing a model (with random weights) from the mgp-str-base style configuration + >>> model = MgpstrForSceneTextRecognition(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mgp-str" + + def __init__( + self, + image_size=[32, 128], + patch_size=4, + num_channels=3, + max_token_length=27, + num_character_labels=38, + num_bpe_labels=50257, + num_wordpiece_labels=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + distilled=False, + layer_norm_eps=1e-5, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + output_a3_attentions=False, + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.max_token_length = max_token_length + self.num_character_labels = num_character_labels + self.num_bpe_labels = num_bpe_labels + self.num_wordpiece_labels = num_wordpiece_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.distilled = distilled + self.layer_norm_eps = layer_norm_eps + self.drop_rate = drop_rate + self.qkv_bias = qkv_bias + self.attn_drop_rate = attn_drop_rate + self.drop_path_rate = drop_path_rate + self.output_a3_attentions = output_a3_attentions + self.initializer_range = initializer_range + + +__all__ = ["MgpstrConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/modeling_mgp_str.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/modeling_mgp_str.py new file mode 100644 index 0000000000000000000000000000000000000000..be7cf08b14ab74b4a4598d4ee557f9388199a3ed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/modeling_mgp_str.py @@ -0,0 +1,459 @@ +# coding=utf-8 +# Copyright 2023 Alibaba Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch MGP-STR model.""" + +import collections.abc +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, auto_docstring, logging +from .configuration_mgp_str import MgpstrConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Mgpstr +class MgpstrDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + """ +) +class MgpstrModelOutput(ModelOutput): + r""" + logits (`tuple(torch.FloatTensor)` of shape `(batch_size, config.num_character_labels)`): + Tuple of `torch.FloatTensor` (one for the output of character of shape `(batch_size, + config.max_token_length, config.num_character_labels)`, + one for the output of bpe of shape `(batch_size, + config.max_token_length, config.num_bpe_labels)`, + one for the output of wordpiece of shape `(batch_size, + config.max_token_length, config.num_wordpiece_labels)`) . + + Classification scores (before SoftMax) of character, bpe and wordpiece. + a3_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_a3_attentions=True` is passed or when `config.output_a3_attentions=True`): + Tuple of `torch.FloatTensor` (one for the attention of character, + one for the attention of bpe`, + one + for the attention of wordpiece) of shape `(batch_size, config.max_token_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: Optional[tuple[torch.FloatTensor]] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + a3_attentions: Optional[tuple[torch.FloatTensor]] = None + + +class MgpstrEmbeddings(nn.Module): + """2D Image to Patch Embedding""" + + def __init__(self, config: MgpstrConfig): + super().__init__() + image_size = ( + config.image_size + if isinstance(config.image_size, collections.abc.Iterable) + else (config.image_size, config.image_size) + ) + patch_size = ( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) + ) + self.image_size = image_size + self.patch_size = patch_size + self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.num_tokens = 2 if config.distilled else 1 + + self.proj = nn.Conv2d(config.num_channels, config.hidden_size, kernel_size=patch_size, stride=patch_size) + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + + self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches + self.num_tokens, config.hidden_size)) + self.pos_drop = nn.Dropout(p=config.drop_rate) + + def forward(self, pixel_values): + batch_size, channel, height, width = pixel_values.shape + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." + ) + + patch_embeddings = self.proj(pixel_values) + patch_embeddings = patch_embeddings.flatten(2).transpose(1, 2) # BCHW -> BNC + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embedding_output = torch.cat((cls_tokens, patch_embeddings), dim=1) + embedding_output = embedding_output + self.pos_embed + embedding_output = self.pos_drop(embedding_output) + + return embedding_output + + +class MgpstrMlp(nn.Module): + """MLP as used in Vision Transformer, MLP-Mixer and related networks""" + + def __init__(self, config: MgpstrConfig, hidden_features): + super().__init__() + hidden_features = hidden_features or config.hidden_size + self.fc1 = nn.Linear(config.hidden_size, hidden_features) + self.act = nn.GELU() + self.fc2 = nn.Linear(hidden_features, config.hidden_size) + self.drop = nn.Dropout(config.drop_rate) + + def forward(self, hidden_states): + hidden_states = self.fc1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.drop(hidden_states) + hidden_states = self.fc2(hidden_states) + hidden_states = self.drop(hidden_states) + return hidden_states + + +class MgpstrAttention(nn.Module): + def __init__(self, config: MgpstrConfig): + super().__init__() + self.num_heads = config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias) + self.attn_drop = nn.Dropout(config.attn_drop_rate) + self.proj = nn.Linear(config.hidden_size, config.hidden_size) + self.proj_drop = nn.Dropout(config.drop_rate) + + def forward(self, hidden_states): + batch_size, num, channel = hidden_states.shape + qkv = ( + self.qkv(hidden_states) + .reshape(batch_size, num, 3, self.num_heads, channel // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + query, key, value = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + attention_probs = (query @ key.transpose(-2, -1)) * self.scale + attention_probs = attention_probs.softmax(dim=-1) + attention_probs = self.attn_drop(attention_probs) + + context_layer = (attention_probs @ value).transpose(1, 2).reshape(batch_size, num, channel) + context_layer = self.proj(context_layer) + context_layer = self.proj_drop(context_layer) + return (context_layer, attention_probs) + + +class MgpstrLayer(nn.Module): + def __init__(self, config: MgpstrConfig, drop_path=None): + super().__init__() + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attn = MgpstrAttention(config) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = MgpstrDropPath(drop_path) if drop_path is not None else nn.Identity() + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + mlp_hidden_dim = int(config.hidden_size * config.mlp_ratio) + self.mlp = MgpstrMlp(config, mlp_hidden_dim) + + def forward(self, hidden_states): + self_attention_outputs = self.attn(self.norm1(hidden_states)) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1] + + # first residual connection + hidden_states = self.drop_path(attention_output) + hidden_states + + # second residual connection is done here + layer_output = hidden_states + self.drop_path(self.mlp(self.norm2(hidden_states))) + + outputs = (layer_output, outputs) + return outputs + + +class MgpstrEncoder(nn.Module): + def __init__(self, config: MgpstrConfig): + super().__init__() + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")] + + self.blocks = nn.Sequential( + *[MgpstrLayer(config=config, drop_path=dpr[i]) for i in range(config.num_hidden_layers)] + ) + + def forward(self, hidden_states, output_attentions=False, output_hidden_states=False, return_dict=True): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for _, blk in enumerate(self.blocks): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = blk(hidden_states) + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class MgpstrA3Module(nn.Module): + def __init__(self, config: MgpstrConfig): + super().__init__() + self.token_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.tokenLearner = nn.Sequential( + nn.Conv2d(config.hidden_size, config.hidden_size, kernel_size=(1, 1), stride=1, groups=8, bias=False), + nn.Conv2d(config.hidden_size, config.max_token_length, kernel_size=(1, 1), stride=1, bias=False), + ) + self.feat = nn.Conv2d( + config.hidden_size, config.hidden_size, kernel_size=(1, 1), stride=1, groups=8, bias=False + ) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.token_norm(hidden_states) + hidden_states = hidden_states.transpose(1, 2).unsqueeze(-1) + selected = self.tokenLearner(hidden_states) + selected = selected.flatten(2) + attentions = F.softmax(selected, dim=-1) + + feat = self.feat(hidden_states) + feat = feat.flatten(2).transpose(1, 2) + feat = torch.einsum("...si,...id->...sd", attentions, feat) + a3_out = self.norm(feat) + + return (a3_out, attentions) + + +@auto_docstring +class MgpstrPreTrainedModel(PreTrainedModel): + config: MgpstrConfig + base_model_prefix = "mgp_str" + _no_split_modules = [] + + def _init_weights(self, module: nn.Module) -> None: + """Initialize the weights""" + std = self.config.initializer_range + if isinstance(module, MgpstrEmbeddings): + nn.init.trunc_normal_(module.pos_embed, mean=0.0, std=std) + nn.init.trunc_normal_(module.cls_token, mean=0.0, std=std) + elif isinstance(module, (nn.Linear, nn.Conv2d)): + nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring +class MgpstrModel(MgpstrPreTrainedModel): + def __init__(self, config: MgpstrConfig): + super().__init__(config) + self.config = config + self.embeddings = MgpstrEmbeddings(config) + self.encoder = MgpstrEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.proj + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return encoder_outputs + return BaseModelOutput( + last_hidden_state=encoder_outputs.last_hidden_state, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + MGP-STR Model transformer with three classification heads on top (three A^3 modules and three linear layer on top + of the transformer encoder output) for scene text recognition (STR) . + """ +) +class MgpstrForSceneTextRecognition(MgpstrPreTrainedModel): + config: MgpstrConfig + main_input_name = "pixel_values" + + def __init__(self, config: MgpstrConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.mgp_str = MgpstrModel(config) + + self.char_a3_module = MgpstrA3Module(config) + self.bpe_a3_module = MgpstrA3Module(config) + self.wp_a3_module = MgpstrA3Module(config) + + self.char_head = nn.Linear(config.hidden_size, config.num_character_labels) + self.bpe_head = nn.Linear(config.hidden_size, config.num_bpe_labels) + self.wp_head = nn.Linear(config.hidden_size, config.num_wordpiece_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + output_attentions: Optional[bool] = None, + output_a3_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], MgpstrModelOutput]: + r""" + output_a3_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of a3 modules. See `a3_attentions` under returned tensors + for more detail. + + Example: + + ```python + >>> from transformers import ( + ... MgpstrProcessor, + ... MgpstrForSceneTextRecognition, + ... ) + >>> import requests + >>> from PIL import Image + + >>> # load image from the IIIT-5k dataset + >>> url = "https://i.postimg.cc/ZKwLg2Gw/367-14.png" + >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + >>> processor = MgpstrProcessor.from_pretrained("alibaba-damo/mgp-str-base") + >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values + + >>> model = MgpstrForSceneTextRecognition.from_pretrained("alibaba-damo/mgp-str-base") + + >>> # inference + >>> outputs = model(pixel_values) + >>> out_strs = processor.batch_decode(outputs.logits) + >>> out_strs["generated_text"] + '["ticket"]' + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + mgp_outputs = self.mgp_str( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = mgp_outputs[0] + + char_a3_out, char_attention = self.char_a3_module(sequence_output) + bpe_a3_out, bpe_attention = self.bpe_a3_module(sequence_output) + wp_a3_out, wp_attention = self.wp_a3_module(sequence_output) + + char_logits = self.char_head(char_a3_out) + bpe_logits = self.bpe_head(bpe_a3_out) + wp_logits = self.wp_head(wp_a3_out) + + all_a3_attentions = (char_attention, bpe_attention, wp_attention) if output_a3_attentions else None + all_logits = (char_logits, bpe_logits, wp_logits) + + if not return_dict: + outputs = (all_logits, all_a3_attentions) + mgp_outputs[1:] + return tuple(output for output in outputs if output is not None) + return MgpstrModelOutput( + logits=all_logits, + hidden_states=mgp_outputs.hidden_states, + attentions=mgp_outputs.attentions, + a3_attentions=all_a3_attentions, + ) + + +__all__ = ["MgpstrModel", "MgpstrPreTrainedModel", "MgpstrForSceneTextRecognition"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/processing_mgp_str.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/processing_mgp_str.py new file mode 100644 index 0000000000000000000000000000000000000000..8f7d32388748b2c8d9ea3bbf35edacc464408a69 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/processing_mgp_str.py @@ -0,0 +1,231 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Processor class for MGP-STR.""" + +import warnings + +from transformers import AutoTokenizer +from transformers.utils import is_torch_available +from transformers.utils.generic import ExplicitEnum + +from ...processing_utils import ProcessorMixin +from ...utils.import_utils import requires + + +if is_torch_available(): + import torch + + +class DecodeType(ExplicitEnum): + CHARACTER = "char" + BPE = "bpe" + WORDPIECE = "wp" + + +SUPPORTED_ANNOTATION_FORMATS = (DecodeType.CHARACTER, DecodeType.BPE, DecodeType.WORDPIECE) + + +@requires(backends=("sentencepiece",)) +class MgpstrProcessor(ProcessorMixin): + r""" + Constructs a MGP-STR processor which wraps an image processor and MGP-STR tokenizers into a single + + [`MgpstrProcessor`] offers all the functionalities of `ViTImageProcessor`] and [`MgpstrTokenizer`]. See the + [`~MgpstrProcessor.__call__`] and [`~MgpstrProcessor.batch_decode`] for more information. + + Args: + image_processor (`ViTImageProcessor`, *optional*): + An instance of `ViTImageProcessor`. The image processor is a required input. + tokenizer ([`MgpstrTokenizer`], *optional*): + The tokenizer is a required input. + """ + + attributes = ["image_processor", "char_tokenizer"] + image_processor_class = ("ViTImageProcessor", "ViTImageProcessorFast") + char_tokenizer_class = "MgpstrTokenizer" + + def __init__(self, image_processor=None, tokenizer=None, **kwargs): + feature_extractor = None + if "feature_extractor" in kwargs: + warnings.warn( + "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" + " instead.", + FutureWarning, + ) + feature_extractor = kwargs.pop("feature_extractor") + + image_processor = image_processor if image_processor is not None else feature_extractor + + self.char_tokenizer = tokenizer + self.bpe_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") + self.wp_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + + super().__init__(image_processor, tokenizer) + + def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + """ + When used in normal mode, this method forwards all its arguments to ViTImageProcessor's + [`~ViTImageProcessor.__call__`] and returns its output. This method also forwards the `text` and `kwargs` + arguments to MgpstrTokenizer's [`~MgpstrTokenizer.__call__`] if `text` is not `None` to encode the text. Please + refer to the docstring of the above methods for more information. + """ + if images is None and text is None: + raise ValueError("You need to specify either an `images` or `text` input to process.") + + if images is not None: + inputs = self.image_processor(images, return_tensors=return_tensors, **kwargs) + if text is not None: + encodings = self.char_tokenizer(text, return_tensors=return_tensors, **kwargs) + + if text is None: + return inputs + elif images is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] + return inputs + + def batch_decode(self, sequences): + """ + Convert a list of lists of token ids into a list of strings by calling decode. + + Args: + sequences (`torch.Tensor`): + List of tokenized input ids. + + Returns: + `dict[str, any]`: Dictionary of all the outputs of the decoded results. + generated_text (`list[str]`): The final results after fusion of char, bpe, and wp. scores + (`list[float]`): The final scores after fusion of char, bpe, and wp. char_preds (`list[str]`): The list + of character decoded sentences. bpe_preds (`list[str]`): The list of bpe decoded sentences. wp_preds + (`list[str]`): The list of wp decoded sentences. + + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + char_preds, bpe_preds, wp_preds = sequences + batch_size = char_preds.size(0) + + char_strs, char_scores = self._decode_helper(char_preds, "char") + bpe_strs, bpe_scores = self._decode_helper(bpe_preds, "bpe") + wp_strs, wp_scores = self._decode_helper(wp_preds, "wp") + + final_strs = [] + final_scores = [] + for i in range(batch_size): + scores = [char_scores[i], bpe_scores[i], wp_scores[i]] + strs = [char_strs[i], bpe_strs[i], wp_strs[i]] + max_score_index = scores.index(max(scores)) + final_strs.append(strs[max_score_index]) + final_scores.append(scores[max_score_index]) + + out = {} + out["generated_text"] = final_strs + out["scores"] = final_scores + out["char_preds"] = char_strs + out["bpe_preds"] = bpe_strs + out["wp_preds"] = wp_strs + return out + + def _decode_helper(self, pred_logits, format): + """ + Convert a list of lists of bpe token ids into a list of strings by calling bpe tokenizer. + + Args: + pred_logits (`torch.Tensor`): + List of model prediction logits. + format (`Union[DecoderType, str]`): + Type of model prediction. Must be one of ['char', 'bpe', 'wp']. + Returns: + `tuple`: + dec_strs(`str`): The decode strings of model prediction. conf_scores(`list[float]`): The confidence + score of model prediction. + """ + if format == DecodeType.CHARACTER: + decoder = self.char_decode + eos_token = 1 + eos_str = "[s]" + elif format == DecodeType.BPE: + decoder = self.bpe_decode + eos_token = 2 + eos_str = "#" + elif format == DecodeType.WORDPIECE: + decoder = self.wp_decode + eos_token = 102 + eos_str = "[SEP]" + else: + raise ValueError(f"Format {format} is not supported.") + + dec_strs, conf_scores = [], [] + batch_size = pred_logits.size(0) + batch_max_length = pred_logits.size(1) + _, preds_index = pred_logits.topk(1, dim=-1, largest=True, sorted=True) + preds_index = preds_index.view(-1, batch_max_length)[:, 1:] + preds_str = decoder(preds_index) + preds_max_prob, _ = torch.nn.functional.softmax(pred_logits, dim=2).max(dim=2) + preds_max_prob = preds_max_prob[:, 1:] + + for index in range(batch_size): + pred_eos = preds_str[index].find(eos_str) + pred = preds_str[index][:pred_eos] + pred_index = preds_index[index].tolist() + pred_eos_index = pred_index.index(eos_token) if eos_token in pred_index else -1 + pred_max_prob = preds_max_prob[index][: pred_eos_index + 1] + confidence_score = pred_max_prob.cumprod(dim=0)[-1] if pred_max_prob.nelement() != 0 else 0.0 + dec_strs.append(pred) + conf_scores.append(confidence_score) + + return dec_strs, conf_scores + + def char_decode(self, sequences): + """ + Convert a list of lists of char token ids into a list of strings by calling char tokenizer. + + Args: + sequences (`torch.Tensor`): + List of tokenized input ids. + Returns: + `list[str]`: The list of char decoded sentences. + """ + decode_strs = [seq.replace(" ", "") for seq in self.char_tokenizer.batch_decode(sequences)] + return decode_strs + + def bpe_decode(self, sequences): + """ + Convert a list of lists of bpe token ids into a list of strings by calling bpe tokenizer. + + Args: + sequences (`torch.Tensor`): + List of tokenized input ids. + Returns: + `list[str]`: The list of bpe decoded sentences. + """ + return self.bpe_tokenizer.batch_decode(sequences) + + def wp_decode(self, sequences): + """ + Convert a list of lists of word piece token ids into a list of strings by calling word piece tokenizer. + + Args: + sequences (`torch.Tensor`): + List of tokenized input ids. + Returns: + `list[str]`: The list of wp decoded sentences. + """ + decode_strs = [seq.replace(" ", "") for seq in self.wp_tokenizer.batch_decode(sequences)] + return decode_strs + + +__all__ = ["MgpstrProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/tokenization_mgp_str.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/tokenization_mgp_str.py new file mode 100644 index 0000000000000000000000000000000000000000..f29b8a8348ee3771210df17de79be6c30a65c34a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/tokenization_mgp_str.py @@ -0,0 +1,104 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for MGT-STR CHAR.""" + +import json +import os +from typing import Optional + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"} + + +class MgpstrTokenizer(PreTrainedTokenizer): + """ + Construct a MGP-STR char tokenizer. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + unk_token (`str`, *optional*, defaults to `"[GO]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `"[GO]"`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `"[s]"`): + The end of sequence token. + pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[GO]"`): + A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by + attention mechanisms or loss computation. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__(self, vocab_file, unk_token="[GO]", bos_token="[GO]", eos_token="[s]", pad_token="[GO]", **kwargs): + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.vocab = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.vocab.items()} + super().__init__( + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + **kwargs, + ) + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + vocab = dict(self.vocab).copy() + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Tokenize a string.""" + char_tokens = [] + for s in text: + char_tokens.extend(s) + return char_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + return (vocab_file,) + + +__all__ = ["MgpstrTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17d7eb983f6d3a87babe0bf066fde67d0b476694 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/configuration_minimax.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/configuration_minimax.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e76ab3663b83972dd5895b878f15b759dc52f81b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/configuration_minimax.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/modeling_minimax.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/modeling_minimax.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..abb8cb25c3326a610596975e24b25257f84edab4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/modeling_minimax.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/modular_minimax.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/modular_minimax.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a57b16b27ab3326d780475de25fd05f80b36b1da Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/modular_minimax.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..803d33bbfaa44b597d3848a95a3b674a413eed10 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/configuration_ministral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/configuration_ministral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1673c69c047e194d23257018f062d25b6d14407 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/configuration_ministral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/modular_ministral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/modular_ministral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df4be675d68f72fb78f061599e27276e82dc9ed3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/modular_ministral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cdad2afa8c35c2a7d4de612ccbcd69b1658a93f6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/configuration_mobilebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/configuration_mobilebert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37337d52ad7988b649c3abfade43d0d65c5c8d4b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/configuration_mobilebert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/modeling_mobilebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/modeling_mobilebert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28472af3f089996e67ae1caacd79f5848134dec9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/modeling_mobilebert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/modeling_tf_mobilebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/modeling_tf_mobilebert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65633a8aa1e152943b8c941bd29dcba878723a58 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/modeling_tf_mobilebert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..363f0cd34add34fd6ec00f3ec729b33bd19fb8c7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e09efd5674529048b9db54e5cff9b7e376468ab8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bce83216c35546fe01cf96c738cbe9c2f3582486 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_mobilenet_v1 import * + from .feature_extraction_mobilenet_v1 import * + from .image_processing_mobilenet_v1 import * + from .image_processing_mobilenet_v1_fast import * + from .modeling_mobilenet_v1 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd21d9a898ec65f6ba7b325cf7acd1933b54402a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/configuration_mobilenet_v1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/configuration_mobilenet_v1.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..506d147772dc443284f286a1aaa0cc47bd5de5d0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/configuration_mobilenet_v1.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/feature_extraction_mobilenet_v1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/feature_extraction_mobilenet_v1.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fabf58e7af316c0fb81ec1d754433407d442d412 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/feature_extraction_mobilenet_v1.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/image_processing_mobilenet_v1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/image_processing_mobilenet_v1.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb13cfa5c4ed0ee89f526765ec01b31c53ff80b0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/image_processing_mobilenet_v1.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/image_processing_mobilenet_v1_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/image_processing_mobilenet_v1_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94bd6c771f95de7cb74c136c76a550f58116d68c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/image_processing_mobilenet_v1_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/modeling_mobilenet_v1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/modeling_mobilenet_v1.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e521352dddfcd5e4669a2af0e0ab1fb8f158f728 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/modeling_mobilenet_v1.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..c18d72d55003dc88e8fbcfe2ee80a18975d23dbc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py @@ -0,0 +1,126 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MobileNetV1 model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class MobileNetV1Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MobileNetV1Model`]. It is used to instantiate a + MobileNetV1 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the MobileNetV1 + [google/mobilenet_v1_1.0_224](https://huggingface.co/google/mobilenet_v1_1.0_224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + depth_multiplier (`float`, *optional*, defaults to 1.0): + Shrinks or expands the number of channels in each layer. Default is 1.0, which starts the network with 32 + channels. This is sometimes also called "alpha" or "width multiplier". + min_depth (`int`, *optional*, defaults to 8): + All layers will have at least this many channels. + hidden_act (`str` or `function`, *optional*, defaults to `"relu6"`): + The non-linear activation function (function or string) in the Transformer encoder and convolution layers. + tf_padding (`bool`, *optional*, defaults to `True`): + Whether to use TensorFlow padding rules on the convolution layers. + classifier_dropout_prob (`float`, *optional*, defaults to 0.999): + The dropout ratio for attached classifiers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 0.001): + The epsilon used by the layer normalization layers. + + Example: + + ```python + >>> from transformers import MobileNetV1Config, MobileNetV1Model + + >>> # Initializing a "mobilenet_v1_1.0_224" style configuration + >>> configuration = MobileNetV1Config() + + >>> # Initializing a model from the "mobilenet_v1_1.0_224" style configuration + >>> model = MobileNetV1Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mobilenet_v1" + + def __init__( + self, + num_channels=3, + image_size=224, + depth_multiplier=1.0, + min_depth=8, + hidden_act="relu6", + tf_padding=True, + classifier_dropout_prob=0.999, + initializer_range=0.02, + layer_norm_eps=0.001, + **kwargs, + ): + super().__init__(**kwargs) + + if depth_multiplier <= 0: + raise ValueError("depth_multiplier must be greater than zero.") + + self.num_channels = num_channels + self.image_size = image_size + self.depth_multiplier = depth_multiplier + self.min_depth = min_depth + self.hidden_act = hidden_act + self.tf_padding = tf_padding + self.classifier_dropout_prob = classifier_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + +class MobileNetV1OnnxConfig(OnnxConfig): + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict([("pixel_values", {0: "batch"})]) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "image-classification": + return OrderedDict([("logits", {0: "batch"})]) + else: + return OrderedDict([("last_hidden_state", {0: "batch"}), ("pooler_output", {0: "batch"})]) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + +__all__ = ["MobileNetV1Config", "MobileNetV1OnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..25997a46790c009bc62e5b4323d378386e3c1718 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py @@ -0,0 +1,414 @@ +# coding=utf-8 +# Copyright 2022 Apple Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch MobileNetV1 model.""" + +from typing import Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, logging +from .configuration_mobilenet_v1 import MobileNetV1Config + + +logger = logging.get_logger(__name__) + + +def _build_tf_to_pytorch_map(model, config, tf_weights=None): + """ + A map of modules from TF to PyTorch. + """ + + tf_to_pt_map = {} + + if isinstance(model, MobileNetV1ForImageClassification): + backbone = model.mobilenet_v1 + else: + backbone = model + + prefix = "MobilenetV1/Conv2d_0/" + tf_to_pt_map[prefix + "weights"] = backbone.conv_stem.convolution.weight + tf_to_pt_map[prefix + "BatchNorm/beta"] = backbone.conv_stem.normalization.bias + tf_to_pt_map[prefix + "BatchNorm/gamma"] = backbone.conv_stem.normalization.weight + tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = backbone.conv_stem.normalization.running_mean + tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = backbone.conv_stem.normalization.running_var + + for i in range(13): + tf_index = i + 1 + pt_index = i * 2 + + pointer = backbone.layer[pt_index] + prefix = f"MobilenetV1/Conv2d_{tf_index}_depthwise/" + tf_to_pt_map[prefix + "depthwise_weights"] = pointer.convolution.weight + tf_to_pt_map[prefix + "BatchNorm/beta"] = pointer.normalization.bias + tf_to_pt_map[prefix + "BatchNorm/gamma"] = pointer.normalization.weight + tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = pointer.normalization.running_mean + tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = pointer.normalization.running_var + + pointer = backbone.layer[pt_index + 1] + prefix = f"MobilenetV1/Conv2d_{tf_index}_pointwise/" + tf_to_pt_map[prefix + "weights"] = pointer.convolution.weight + tf_to_pt_map[prefix + "BatchNorm/beta"] = pointer.normalization.bias + tf_to_pt_map[prefix + "BatchNorm/gamma"] = pointer.normalization.weight + tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = pointer.normalization.running_mean + tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = pointer.normalization.running_var + + if isinstance(model, MobileNetV1ForImageClassification): + prefix = "MobilenetV1/Logits/Conv2d_1c_1x1/" + tf_to_pt_map[prefix + "weights"] = model.classifier.weight + tf_to_pt_map[prefix + "biases"] = model.classifier.bias + + return tf_to_pt_map + + +def load_tf_weights_in_mobilenet_v1(model, config, tf_checkpoint_path): + """Load TensorFlow checkpoints in a PyTorch model.""" + try: + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + + # Load weights from TF model + init_vars = tf.train.list_variables(tf_checkpoint_path) + tf_weights = {} + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_checkpoint_path, name) + tf_weights[name] = array + + # Build TF to PyTorch weights loading map + tf_to_pt_map = _build_tf_to_pytorch_map(model, config, tf_weights) + + for name, pointer in tf_to_pt_map.items(): + logger.info(f"Importing {name}") + if name not in tf_weights: + logger.info(f"{name} not in tf pre-trained weights, skipping") + continue + + array = tf_weights[name] + + if "depthwise_weights" in name: + logger.info("Transposing depthwise") + array = np.transpose(array, (2, 3, 0, 1)) + elif "weights" in name: + logger.info("Transposing") + if len(pointer.shape) == 2: # copying into linear layer + array = array.squeeze().transpose() + else: + array = np.transpose(array, (3, 2, 0, 1)) + + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + + logger.info(f"Initialize PyTorch weight {name} {array.shape}") + pointer.data = torch.from_numpy(array) + + tf_weights.pop(name, None) + tf_weights.pop(name + "/RMSProp", None) + tf_weights.pop(name + "/RMSProp_1", None) + tf_weights.pop(name + "/ExponentialMovingAverage", None) + + logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}") + return model + + +def apply_tf_padding(features: torch.Tensor, conv_layer: nn.Conv2d) -> torch.Tensor: + """ + Apply TensorFlow-style "SAME" padding to a convolution layer. See the notes at: + https://www.tensorflow.org/api_docs/python/tf/nn#notes_on_padding_2 + """ + in_height, in_width = features.shape[-2:] + stride_height, stride_width = conv_layer.stride + kernel_height, kernel_width = conv_layer.kernel_size + + if in_height % stride_height == 0: + pad_along_height = max(kernel_height - stride_height, 0) + else: + pad_along_height = max(kernel_height - (in_height % stride_height), 0) + + if in_width % stride_width == 0: + pad_along_width = max(kernel_width - stride_width, 0) + else: + pad_along_width = max(kernel_width - (in_width % stride_width), 0) + + pad_left = pad_along_width // 2 + pad_right = pad_along_width - pad_left + pad_top = pad_along_height // 2 + pad_bottom = pad_along_height - pad_top + + padding = (pad_left, pad_right, pad_top, pad_bottom) + return nn.functional.pad(features, padding, "constant", 0.0) + + +class MobileNetV1ConvLayer(nn.Module): + def __init__( + self, + config: MobileNetV1Config, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: Optional[int] = 1, + groups: Optional[int] = 1, + bias: bool = False, + use_normalization: Optional[bool] = True, + use_activation: Optional[Union[bool, str]] = True, + ) -> None: + super().__init__() + self.config = config + + if in_channels % groups != 0: + raise ValueError(f"Input channels ({in_channels}) are not divisible by {groups} groups.") + if out_channels % groups != 0: + raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.") + + padding = 0 if config.tf_padding else int((kernel_size - 1) / 2) + + self.convolution = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=bias, + padding_mode="zeros", + ) + + if use_normalization: + self.normalization = nn.BatchNorm2d( + num_features=out_channels, + eps=config.layer_norm_eps, + momentum=0.9997, + affine=True, + track_running_stats=True, + ) + else: + self.normalization = None + + if use_activation: + if isinstance(use_activation, str): + self.activation = ACT2FN[use_activation] + elif isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + else: + self.activation = None + + def forward(self, features: torch.Tensor) -> torch.Tensor: + if self.config.tf_padding: + features = apply_tf_padding(features, self.convolution) + features = self.convolution(features) + if self.normalization is not None: + features = self.normalization(features) + if self.activation is not None: + features = self.activation(features) + return features + + +@auto_docstring +class MobileNetV1PreTrainedModel(PreTrainedModel): + config: MobileNetV1Config + load_tf_weights = load_tf_weights_in_mobilenet_v1 + base_model_prefix = "mobilenet_v1" + main_input_name = "pixel_values" + supports_gradient_checkpointing = False + _no_split_modules = [] + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.BatchNorm2d): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring +class MobileNetV1Model(MobileNetV1PreTrainedModel): + def __init__(self, config: MobileNetV1Config, add_pooling_layer: bool = True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + + depth = 32 + out_channels = max(int(depth * config.depth_multiplier), config.min_depth) + + self.conv_stem = MobileNetV1ConvLayer( + config, + in_channels=config.num_channels, + out_channels=out_channels, + kernel_size=3, + stride=2, + ) + + strides = [1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1] + + self.layer = nn.ModuleList() + for i in range(13): + in_channels = out_channels + + if strides[i] == 2 or i == 0: + depth *= 2 + out_channels = max(int(depth * config.depth_multiplier), config.min_depth) + + self.layer.append( + MobileNetV1ConvLayer( + config, + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=strides[i], + groups=in_channels, + ) + ) + + self.layer.append( + MobileNetV1ConvLayer( + config, + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + ) + ) + + self.pooler = nn.AdaptiveAvgPool2d((1, 1)) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def _prune_heads(self, heads_to_prune): + raise NotImplementedError + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.conv_stem(pixel_values) + + all_hidden_states = () if output_hidden_states else None + + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + last_hidden_state = hidden_states + + if self.pooler is not None: + pooled_output = torch.flatten(self.pooler(last_hidden_state), start_dim=1) + else: + pooled_output = None + + if not return_dict: + return tuple(v for v in [last_hidden_state, pooled_output, all_hidden_states] if v is not None) + + return BaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=all_hidden_states, + ) + + +@auto_docstring( + custom_intro=""" + MobileNetV1 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """ +) +class MobileNetV1ForImageClassification(MobileNetV1PreTrainedModel): + def __init__(self, config: MobileNetV1Config) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.mobilenet_v1 = MobileNetV1Model(config) + + last_hidden_size = self.mobilenet_v1.layer[-1].convolution.out_channels + + # Classifier head + self.dropout = nn.Dropout(config.classifier_dropout_prob, inplace=True) + self.classifier = nn.Linear(last_hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutputWithNoAttention]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mobilenet_v1(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(self.dropout(pooled_output)) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutputWithNoAttention( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + ) + + +__all__ = [ + "MobileNetV1ForImageClassification", + "MobileNetV1Model", + "MobileNetV1PreTrainedModel", + "load_tf_weights_in_mobilenet_v1", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36a4229b4a0068a3293e4e11b12fd31551567623 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/configuration_mobilenet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/configuration_mobilenet_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1c6109fd2023d8dcaeb013d67c01b6ec3ba9c69 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/configuration_mobilenet_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/feature_extraction_mobilenet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/feature_extraction_mobilenet_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..873e73b7f99f2b2bbd69da7d7312b98f70ba7ee6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/feature_extraction_mobilenet_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/image_processing_mobilenet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/image_processing_mobilenet_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16c6309c3e759170330184ce1fcc440ff20a4dcd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/image_processing_mobilenet_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/image_processing_mobilenet_v2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/image_processing_mobilenet_v2_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..714e6ad0ed29c3b837fcb011d2c393dc489bcae1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/image_processing_mobilenet_v2_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/modeling_mobilenet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/modeling_mobilenet_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66edc800a400e438ebcd5551d5aaa4a26dcbf6cf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/modeling_mobilenet_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/feature_extraction_mobilenet_v2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/feature_extraction_mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..e36b50cffaada3293d1dada34f06022f3ecf8163 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/feature_extraction_mobilenet_v2.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for MobileNetV2.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_mobilenet_v2 import MobileNetV2ImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class MobileNetV2FeatureExtractor(MobileNetV2ImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class MobileNetV2FeatureExtractor is deprecated and will be removed in version 5 of Transformers." + " Please use MobileNetV2ImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["MobileNetV2FeatureExtractor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..15e10a1d9815935629fadf9405effdae5a16fbb2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_mobilevitv2 import * + from .modeling_mobilevitv2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a1bbb4fb72d73b2c5bc4a4305b24a9e1346cfb8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/configuration_mobilevitv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/configuration_mobilevitv2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..029cd2f974327a28a2bc390d8f527d286bfc6a35 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/configuration_mobilevitv2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/modeling_mobilevitv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/modeling_mobilevitv2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26870c71541e38ca83037e212be00433d5c4481c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/modeling_mobilevitv2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/configuration_mobilevitv2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/configuration_mobilevitv2.py new file mode 100644 index 0000000000000000000000000000000000000000..dc9db247602c77daa35c3c3ad97dbda10e881cc8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/configuration_mobilevitv2.py @@ -0,0 +1,168 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MobileViTV2 model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class MobileViTV2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MobileViTV2Model`]. It is used to instantiate a + MobileViTV2 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the MobileViTV2 + [apple/mobilevitv2-1.0](https://huggingface.co/apple/mobilevitv2-1.0) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + image_size (`int`, *optional*, defaults to 256): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 2): + The size (resolution) of each patch. + expand_ratio (`float`, *optional*, defaults to 2.0): + Expansion factor for the MobileNetv2 layers. + hidden_act (`str` or `function`, *optional*, defaults to `"swish"`): + The non-linear activation function (function or string) in the Transformer encoder and convolution layers. + conv_kernel_size (`int`, *optional*, defaults to 3): + The size of the convolutional kernel in the MobileViTV2 layer. + output_stride (`int`, *optional*, defaults to 32): + The ratio of the spatial resolution of the output to the resolution of the input image. + classifier_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for attached classifiers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + aspp_out_channels (`int`, *optional*, defaults to 512): + Number of output channels used in the ASPP layer for semantic segmentation. + atrous_rates (`list[int]`, *optional*, defaults to `[6, 12, 18]`): + Dilation (atrous) factors used in the ASPP layer for semantic segmentation. + aspp_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the ASPP layer for semantic segmentation. + semantic_loss_ignore_index (`int`, *optional*, defaults to 255): + The index that is ignored by the loss function of the semantic segmentation model. + n_attn_blocks (`list[int]`, *optional*, defaults to `[2, 4, 3]`): + The number of attention blocks in each MobileViTV2Layer + base_attn_unit_dims (`list[int]`, *optional*, defaults to `[128, 192, 256]`): + The base multiplier for dimensions of attention blocks in each MobileViTV2Layer + width_multiplier (`float`, *optional*, defaults to 1.0): + The width multiplier for MobileViTV2. + ffn_multiplier (`int`, *optional*, defaults to 2): + The FFN multiplier for MobileViTV2. + attn_dropout (`float`, *optional*, defaults to 0.0): + The dropout in the attention layer. + ffn_dropout (`float`, *optional*, defaults to 0.0): + The dropout between FFN layers. + + Example: + + ```python + >>> from transformers import MobileViTV2Config, MobileViTV2Model + + >>> # Initializing a mobilevitv2-small style configuration + >>> configuration = MobileViTV2Config() + + >>> # Initializing a model from the mobilevitv2-small style configuration + >>> model = MobileViTV2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mobilevitv2" + + def __init__( + self, + num_channels=3, + image_size=256, + patch_size=2, + expand_ratio=2.0, + hidden_act="swish", + conv_kernel_size=3, + output_stride=32, + classifier_dropout_prob=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + aspp_out_channels=512, + atrous_rates=[6, 12, 18], + aspp_dropout_prob=0.1, + semantic_loss_ignore_index=255, + n_attn_blocks=[2, 4, 3], + base_attn_unit_dims=[128, 192, 256], + width_multiplier=1.0, + ffn_multiplier=2, + attn_dropout=0.0, + ffn_dropout=0.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size + self.expand_ratio = expand_ratio + self.hidden_act = hidden_act + self.conv_kernel_size = conv_kernel_size + self.output_stride = output_stride + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.n_attn_blocks = n_attn_blocks + self.base_attn_unit_dims = base_attn_unit_dims + self.width_multiplier = width_multiplier + self.ffn_multiplier = ffn_multiplier + self.ffn_dropout = ffn_dropout + self.attn_dropout = attn_dropout + self.classifier_dropout_prob = classifier_dropout_prob + + # decode head attributes for semantic segmentation + self.aspp_out_channels = aspp_out_channels + self.atrous_rates = atrous_rates + self.aspp_dropout_prob = aspp_dropout_prob + self.semantic_loss_ignore_index = semantic_loss_ignore_index + + +class MobileViTV2OnnxConfig(OnnxConfig): + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict([("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"})]) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "image-classification": + return OrderedDict([("logits", {0: "batch"})]) + else: + return OrderedDict([("last_hidden_state", {0: "batch"}), ("pooler_output", {0: "batch"})]) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + +__all__ = ["MobileViTV2Config", "MobileViTV2OnnxConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.py new file mode 100644 index 0000000000000000000000000000000000000000..4e0e972a648ad028f37d4a5928e3c4f178fa9b27 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.py @@ -0,0 +1,947 @@ +# coding=utf-8 +# Copyright 2023 Apple Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Original license: https://github.com/apple/ml-cvnets/blob/main/LICENSE +"""PyTorch MobileViTV2 model.""" + +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithNoAttention, + BaseModelOutputWithPoolingAndNoAttention, + ImageClassifierOutputWithNoAttention, + SemanticSegmenterOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, logging +from .configuration_mobilevitv2 import MobileViTV2Config + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.make_divisible +def make_divisible(value: int, divisor: int = 8, min_value: Optional[int] = None) -> int: + """ + Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the + original TensorFlow repo. It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + """ + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_value < 0.9 * value: + new_value += divisor + return int(new_value) + + +def clip(value: float, min_val: float = float("-inf"), max_val: float = float("inf")) -> float: + return max(min_val, min(max_val, value)) + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTConvLayer with MobileViT->MobileViTV2 +class MobileViTV2ConvLayer(nn.Module): + def __init__( + self, + config: MobileViTV2Config, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + groups: int = 1, + bias: bool = False, + dilation: int = 1, + use_normalization: bool = True, + use_activation: Union[bool, str] = True, + ) -> None: + super().__init__() + padding = int((kernel_size - 1) / 2) * dilation + + if in_channels % groups != 0: + raise ValueError(f"Input channels ({in_channels}) are not divisible by {groups} groups.") + if out_channels % groups != 0: + raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.") + + self.convolution = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + padding_mode="zeros", + ) + + if use_normalization: + self.normalization = nn.BatchNorm2d( + num_features=out_channels, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True, + ) + else: + self.normalization = None + + if use_activation: + if isinstance(use_activation, str): + self.activation = ACT2FN[use_activation] + elif isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + else: + self.activation = None + + def forward(self, features: torch.Tensor) -> torch.Tensor: + features = self.convolution(features) + if self.normalization is not None: + features = self.normalization(features) + if self.activation is not None: + features = self.activation(features) + return features + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTInvertedResidual with MobileViT->MobileViTV2 +class MobileViTV2InvertedResidual(nn.Module): + """ + Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381 + """ + + def __init__( + self, config: MobileViTV2Config, in_channels: int, out_channels: int, stride: int, dilation: int = 1 + ) -> None: + super().__init__() + expanded_channels = make_divisible(int(round(in_channels * config.expand_ratio)), 8) + + if stride not in [1, 2]: + raise ValueError(f"Invalid stride {stride}.") + + self.use_residual = (stride == 1) and (in_channels == out_channels) + + self.expand_1x1 = MobileViTV2ConvLayer( + config, in_channels=in_channels, out_channels=expanded_channels, kernel_size=1 + ) + + self.conv_3x3 = MobileViTV2ConvLayer( + config, + in_channels=expanded_channels, + out_channels=expanded_channels, + kernel_size=3, + stride=stride, + groups=expanded_channels, + dilation=dilation, + ) + + self.reduce_1x1 = MobileViTV2ConvLayer( + config, + in_channels=expanded_channels, + out_channels=out_channels, + kernel_size=1, + use_activation=False, + ) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + residual = features + + features = self.expand_1x1(features) + features = self.conv_3x3(features) + features = self.reduce_1x1(features) + + return residual + features if self.use_residual else features + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTMobileNetLayer with MobileViT->MobileViTV2 +class MobileViTV2MobileNetLayer(nn.Module): + def __init__( + self, config: MobileViTV2Config, in_channels: int, out_channels: int, stride: int = 1, num_stages: int = 1 + ) -> None: + super().__init__() + + self.layer = nn.ModuleList() + for i in range(num_stages): + layer = MobileViTV2InvertedResidual( + config, + in_channels=in_channels, + out_channels=out_channels, + stride=stride if i == 0 else 1, + ) + self.layer.append(layer) + in_channels = out_channels + + def forward(self, features: torch.Tensor) -> torch.Tensor: + for layer_module in self.layer: + features = layer_module(features) + return features + + +class MobileViTV2LinearSelfAttention(nn.Module): + """ + This layer applies a self-attention with linear complexity, as described in MobileViTV2 paper: + https://huggingface.co/papers/2206.02680 + + Args: + config (`MobileVitv2Config`): + Model configuration object + embed_dim (`int`): + `input_channels` from an expected input of size :math:`(batch_size, input_channels, height, width)` + """ + + def __init__(self, config: MobileViTV2Config, embed_dim: int) -> None: + super().__init__() + + self.qkv_proj = MobileViTV2ConvLayer( + config=config, + in_channels=embed_dim, + out_channels=1 + (2 * embed_dim), + bias=True, + kernel_size=1, + use_normalization=False, + use_activation=False, + ) + + self.attn_dropout = nn.Dropout(p=config.attn_dropout) + self.out_proj = MobileViTV2ConvLayer( + config=config, + in_channels=embed_dim, + out_channels=embed_dim, + bias=True, + kernel_size=1, + use_normalization=False, + use_activation=False, + ) + self.embed_dim = embed_dim + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # (batch_size, embed_dim, num_pixels_in_patch, num_patches) --> (batch_size, 1+2*embed_dim, num_pixels_in_patch, num_patches) + qkv = self.qkv_proj(hidden_states) + + # Project hidden_states into query, key and value + # Query --> [batch_size, 1, num_pixels_in_patch, num_patches] + # value, key --> [batch_size, embed_dim, num_pixels_in_patch, num_patches] + query, key, value = torch.split(qkv, split_size_or_sections=[1, self.embed_dim, self.embed_dim], dim=1) + + # apply softmax along num_patches dimension + context_scores = torch.nn.functional.softmax(query, dim=-1) + context_scores = self.attn_dropout(context_scores) + + # Compute context vector + # [batch_size, embed_dim, num_pixels_in_patch, num_patches] x [batch_size, 1, num_pixels_in_patch, num_patches] -> [batch_size, embed_dim, num_pixels_in_patch, num_patches] + context_vector = key * context_scores + # [batch_size, embed_dim, num_pixels_in_patch, num_patches] --> [batch_size, embed_dim, num_pixels_in_patch, 1] + context_vector = torch.sum(context_vector, dim=-1, keepdim=True) + + # combine context vector with values + # [batch_size, embed_dim, num_pixels_in_patch, num_patches] * [batch_size, embed_dim, num_pixels_in_patch, 1] --> [batch_size, embed_dim, num_pixels_in_patch, num_patches] + out = torch.nn.functional.relu(value) * context_vector.expand_as(value) + out = self.out_proj(out) + return out + + +class MobileViTV2FFN(nn.Module): + def __init__( + self, + config: MobileViTV2Config, + embed_dim: int, + ffn_latent_dim: int, + ffn_dropout: float = 0.0, + ) -> None: + super().__init__() + self.conv1 = MobileViTV2ConvLayer( + config=config, + in_channels=embed_dim, + out_channels=ffn_latent_dim, + kernel_size=1, + stride=1, + bias=True, + use_normalization=False, + use_activation=True, + ) + self.dropout1 = nn.Dropout(ffn_dropout) + + self.conv2 = MobileViTV2ConvLayer( + config=config, + in_channels=ffn_latent_dim, + out_channels=embed_dim, + kernel_size=1, + stride=1, + bias=True, + use_normalization=False, + use_activation=False, + ) + self.dropout2 = nn.Dropout(ffn_dropout) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.conv1(hidden_states) + hidden_states = self.dropout1(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = self.dropout2(hidden_states) + return hidden_states + + +class MobileViTV2TransformerLayer(nn.Module): + def __init__( + self, + config: MobileViTV2Config, + embed_dim: int, + ffn_latent_dim: int, + dropout: float = 0.0, + ) -> None: + super().__init__() + self.layernorm_before = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=config.layer_norm_eps) + self.attention = MobileViTV2LinearSelfAttention(config, embed_dim) + self.dropout1 = nn.Dropout(p=dropout) + self.layernorm_after = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=config.layer_norm_eps) + self.ffn = MobileViTV2FFN(config, embed_dim, ffn_latent_dim, config.ffn_dropout) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + layernorm_1_out = self.layernorm_before(hidden_states) + attention_output = self.attention(layernorm_1_out) + hidden_states = attention_output + hidden_states + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.ffn(layer_output) + + layer_output = layer_output + hidden_states + return layer_output + + +class MobileViTV2Transformer(nn.Module): + def __init__(self, config: MobileViTV2Config, n_layers: int, d_model: int) -> None: + super().__init__() + + ffn_multiplier = config.ffn_multiplier + + ffn_dims = [ffn_multiplier * d_model] * n_layers + + # ensure that dims are multiple of 16 + ffn_dims = [int((d // 16) * 16) for d in ffn_dims] + + self.layer = nn.ModuleList() + for block_idx in range(n_layers): + transformer_layer = MobileViTV2TransformerLayer( + config, embed_dim=d_model, ffn_latent_dim=ffn_dims[block_idx] + ) + self.layer.append(transformer_layer) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for layer_module in self.layer: + hidden_states = layer_module(hidden_states) + return hidden_states + + +class MobileViTV2Layer(GradientCheckpointingLayer): + """ + MobileViTV2 layer: https://huggingface.co/papers/2206.02680 + """ + + def __init__( + self, + config: MobileViTV2Config, + in_channels: int, + out_channels: int, + attn_unit_dim: int, + n_attn_blocks: int = 2, + dilation: int = 1, + stride: int = 2, + ) -> None: + super().__init__() + self.patch_width = config.patch_size + self.patch_height = config.patch_size + + cnn_out_dim = attn_unit_dim + + if stride == 2: + self.downsampling_layer = MobileViTV2InvertedResidual( + config, + in_channels=in_channels, + out_channels=out_channels, + stride=stride if dilation == 1 else 1, + dilation=dilation // 2 if dilation > 1 else 1, + ) + in_channels = out_channels + else: + self.downsampling_layer = None + + # Local representations + self.conv_kxk = MobileViTV2ConvLayer( + config, + in_channels=in_channels, + out_channels=in_channels, + kernel_size=config.conv_kernel_size, + groups=in_channels, + ) + self.conv_1x1 = MobileViTV2ConvLayer( + config, + in_channels=in_channels, + out_channels=cnn_out_dim, + kernel_size=1, + use_normalization=False, + use_activation=False, + ) + + # Global representations + self.transformer = MobileViTV2Transformer(config, d_model=attn_unit_dim, n_layers=n_attn_blocks) + + # self.layernorm = MobileViTV2LayerNorm2D(attn_unit_dim, eps=config.layer_norm_eps) + self.layernorm = nn.GroupNorm(num_groups=1, num_channels=attn_unit_dim, eps=config.layer_norm_eps) + + # Fusion + self.conv_projection = MobileViTV2ConvLayer( + config, + in_channels=cnn_out_dim, + out_channels=in_channels, + kernel_size=1, + use_normalization=True, + use_activation=False, + ) + + def unfolding(self, feature_map: torch.Tensor) -> tuple[torch.Tensor, tuple[int, int]]: + batch_size, in_channels, img_height, img_width = feature_map.shape + patches = nn.functional.unfold( + feature_map, + kernel_size=(self.patch_height, self.patch_width), + stride=(self.patch_height, self.patch_width), + ) + patches = patches.reshape(batch_size, in_channels, self.patch_height * self.patch_width, -1) + + return patches, (img_height, img_width) + + def folding(self, patches: torch.Tensor, output_size: tuple[int, int]) -> torch.Tensor: + batch_size, in_dim, patch_size, n_patches = patches.shape + patches = patches.reshape(batch_size, in_dim * patch_size, n_patches) + + feature_map = nn.functional.fold( + patches, + output_size=output_size, + kernel_size=(self.patch_height, self.patch_width), + stride=(self.patch_height, self.patch_width), + ) + + return feature_map + + def forward(self, features: torch.Tensor) -> torch.Tensor: + # reduce spatial dimensions if needed + if self.downsampling_layer: + features = self.downsampling_layer(features) + + # local representation + features = self.conv_kxk(features) + features = self.conv_1x1(features) + + # convert feature map to patches + patches, output_size = self.unfolding(features) + + # learn global representations + patches = self.transformer(patches) + patches = self.layernorm(patches) + + # convert patches back to feature maps + # [batch_size, patch_height, patch_width, input_dim] --> [batch_size, input_dim, patch_height, patch_width] + features = self.folding(patches, output_size) + + features = self.conv_projection(features) + return features + + +class MobileViTV2Encoder(nn.Module): + def __init__(self, config: MobileViTV2Config) -> None: + super().__init__() + self.config = config + + self.layer = nn.ModuleList() + self.gradient_checkpointing = False + + # segmentation architectures like DeepLab and PSPNet modify the strides + # of the classification backbones + dilate_layer_4 = dilate_layer_5 = False + if config.output_stride == 8: + dilate_layer_4 = True + dilate_layer_5 = True + elif config.output_stride == 16: + dilate_layer_5 = True + + dilation = 1 + + layer_0_dim = make_divisible( + clip(value=32 * config.width_multiplier, min_val=16, max_val=64), divisor=8, min_value=16 + ) + + layer_1_dim = make_divisible(64 * config.width_multiplier, divisor=16) + layer_2_dim = make_divisible(128 * config.width_multiplier, divisor=8) + layer_3_dim = make_divisible(256 * config.width_multiplier, divisor=8) + layer_4_dim = make_divisible(384 * config.width_multiplier, divisor=8) + layer_5_dim = make_divisible(512 * config.width_multiplier, divisor=8) + + layer_1 = MobileViTV2MobileNetLayer( + config, + in_channels=layer_0_dim, + out_channels=layer_1_dim, + stride=1, + num_stages=1, + ) + self.layer.append(layer_1) + + layer_2 = MobileViTV2MobileNetLayer( + config, + in_channels=layer_1_dim, + out_channels=layer_2_dim, + stride=2, + num_stages=2, + ) + self.layer.append(layer_2) + + layer_3 = MobileViTV2Layer( + config, + in_channels=layer_2_dim, + out_channels=layer_3_dim, + attn_unit_dim=make_divisible(config.base_attn_unit_dims[0] * config.width_multiplier, divisor=8), + n_attn_blocks=config.n_attn_blocks[0], + ) + self.layer.append(layer_3) + + if dilate_layer_4: + dilation *= 2 + + layer_4 = MobileViTV2Layer( + config, + in_channels=layer_3_dim, + out_channels=layer_4_dim, + attn_unit_dim=make_divisible(config.base_attn_unit_dims[1] * config.width_multiplier, divisor=8), + n_attn_blocks=config.n_attn_blocks[1], + dilation=dilation, + ) + self.layer.append(layer_4) + + if dilate_layer_5: + dilation *= 2 + + layer_5 = MobileViTV2Layer( + config, + in_channels=layer_4_dim, + out_channels=layer_5_dim, + attn_unit_dim=make_divisible(config.base_attn_unit_dims[2] * config.width_multiplier, divisor=8), + n_attn_blocks=config.n_attn_blocks[2], + dilation=dilation, + ) + self.layer.append(layer_5) + + def forward( + self, + hidden_states: torch.Tensor, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutputWithNoAttention]: + all_hidden_states = () if output_hidden_states else None + + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) + + return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states) + + +@auto_docstring +class MobileViTV2PreTrainedModel(PreTrainedModel): + config: MobileViTV2Config + base_model_prefix = "mobilevitv2" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["MobileViTV2Layer"] + + def _init_weights(self, module: nn.Module) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.GroupNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring +class MobileViTV2Model(MobileViTV2PreTrainedModel): + def __init__(self, config: MobileViTV2Config, expand_output: bool = True): + r""" + expand_output (`bool`, *optional*, defaults to `True`): + Whether to expand the output of the model. If `True`, the model will output pooled features in addition to + hidden states. If `False`, only the hidden states will be returned. + """ + super().__init__(config) + self.config = config + self.expand_output = expand_output + + layer_0_dim = make_divisible( + clip(value=32 * config.width_multiplier, min_val=16, max_val=64), divisor=8, min_value=16 + ) + + self.conv_stem = MobileViTV2ConvLayer( + config, + in_channels=config.num_channels, + out_channels=layer_0_dim, + kernel_size=3, + stride=2, + use_normalization=True, + use_activation=True, + ) + self.encoder = MobileViTV2Encoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def _prune_heads(self, heads_to_prune): + """Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel + """ + for layer_index, heads in heads_to_prune.items(): + mobilevitv2_layer = self.encoder.layer[layer_index] + if isinstance(mobilevitv2_layer, MobileViTV2Layer): + for transformer_layer in mobilevitv2_layer.transformer.layer: + transformer_layer.attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.conv_stem(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.expand_output: + last_hidden_state = encoder_outputs[0] + + # global average pooling: (batch_size, channels, height, width) -> (batch_size, channels) + pooled_output = torch.mean(last_hidden_state, dim=[-2, -1], keepdim=False) + else: + last_hidden_state = encoder_outputs[0] + pooled_output = None + + if not return_dict: + output = (last_hidden_state, pooled_output) if pooled_output is not None else (last_hidden_state,) + return output + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +@auto_docstring( + custom_intro=""" + MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """ +) +class MobileViTV2ForImageClassification(MobileViTV2PreTrainedModel): + def __init__(self, config: MobileViTV2Config) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.mobilevitv2 = MobileViTV2Model(config) + + out_channels = make_divisible(512 * config.width_multiplier, divisor=8) # layer 5 output dimension + # Classifier head + self.classifier = ( + nn.Linear(in_features=out_channels, out_features=config.num_labels) + if config.num_labels > 0 + else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutputWithNoAttention]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mobilevitv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutputWithNoAttention( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + ) + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTASPPPooling with MobileViT->MobileViTV2 +class MobileViTV2ASPPPooling(nn.Module): + def __init__(self, config: MobileViTV2Config, in_channels: int, out_channels: int) -> None: + super().__init__() + + self.global_pool = nn.AdaptiveAvgPool2d(output_size=1) + + self.conv_1x1 = MobileViTV2ConvLayer( + config, + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_normalization=True, + use_activation="relu", + ) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + spatial_size = features.shape[-2:] + features = self.global_pool(features) + features = self.conv_1x1(features) + features = nn.functional.interpolate(features, size=spatial_size, mode="bilinear", align_corners=False) + return features + + +class MobileViTV2ASPP(nn.Module): + """ + ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587 + """ + + def __init__(self, config: MobileViTV2Config) -> None: + super().__init__() + + encoder_out_channels = make_divisible(512 * config.width_multiplier, divisor=8) # layer 5 output dimension + in_channels = encoder_out_channels + out_channels = config.aspp_out_channels + + if len(config.atrous_rates) != 3: + raise ValueError("Expected 3 values for atrous_rates") + + self.convs = nn.ModuleList() + + in_projection = MobileViTV2ConvLayer( + config, + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + use_activation="relu", + ) + self.convs.append(in_projection) + + self.convs.extend( + [ + MobileViTV2ConvLayer( + config, + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + dilation=rate, + use_activation="relu", + ) + for rate in config.atrous_rates + ] + ) + + pool_layer = MobileViTV2ASPPPooling(config, in_channels, out_channels) + self.convs.append(pool_layer) + + self.project = MobileViTV2ConvLayer( + config, in_channels=5 * out_channels, out_channels=out_channels, kernel_size=1, use_activation="relu" + ) + + self.dropout = nn.Dropout(p=config.aspp_dropout_prob) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + pyramid = [] + for conv in self.convs: + pyramid.append(conv(features)) + pyramid = torch.cat(pyramid, dim=1) + + pooled_features = self.project(pyramid) + pooled_features = self.dropout(pooled_features) + return pooled_features + + +# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTDeepLabV3 with MobileViT->MobileViTV2 +class MobileViTV2DeepLabV3(nn.Module): + """ + DeepLabv3 architecture: https://huggingface.co/papers/1706.05587 + """ + + def __init__(self, config: MobileViTV2Config) -> None: + super().__init__() + self.aspp = MobileViTV2ASPP(config) + + self.dropout = nn.Dropout2d(config.classifier_dropout_prob) + + self.classifier = MobileViTV2ConvLayer( + config, + in_channels=config.aspp_out_channels, + out_channels=config.num_labels, + kernel_size=1, + use_normalization=False, + use_activation=False, + bias=True, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + features = self.aspp(hidden_states[-1]) + features = self.dropout(features) + features = self.classifier(features) + return features + + +@auto_docstring( + custom_intro=""" + MobileViTV2 model with a semantic segmentation head on top, e.g. for Pascal VOC. + """ +) +class MobileViTV2ForSemanticSegmentation(MobileViTV2PreTrainedModel): + def __init__(self, config: MobileViTV2Config) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.mobilevitv2 = MobileViTV2Model(config, expand_output=False) + self.segmentation_head = MobileViTV2DeepLabV3(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SemanticSegmenterOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). + + Examples: + + ```python + >>> import requests + >>> import torch + >>> from PIL import Image + >>> from transformers import AutoImageProcessor, MobileViTV2ForSemanticSegmentation + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256") + >>> model = MobileViTV2ForSemanticSegmentation.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256") + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> # logits are of shape (batch_size, num_labels, height, width) + >>> logits = outputs.logits + ```""" + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None and self.config.num_labels == 1: + raise ValueError("The number of labels should be greater than one") + + outputs = self.mobilevitv2( + pixel_values, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + + encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1] + + logits = self.segmentation_head(encoder_hidden_states) + + loss = None + if labels is not None: + # upsample logits to the images' original size + upsampled_logits = nn.functional.interpolate( + logits, size=labels.shape[-2:], mode="bilinear", align_corners=False + ) + loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index) + loss = loss_fct(upsampled_logits, labels) + + if not return_dict: + if output_hidden_states: + output = (logits,) + outputs[1:] + else: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SemanticSegmenterOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=None, + ) + + +__all__ = [ + "MobileViTV2ForImageClassification", + "MobileViTV2ForSemanticSegmentation", + "MobileViTV2Model", + "MobileViTV2PreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5b33ef6ade8cf12d1af4e220d74cdb89bc10c8f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/configuration_moonshine.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/configuration_moonshine.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee533499880562a619f1f932d7f135e69cfadce6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/configuration_moonshine.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/modeling_moonshine.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/modeling_moonshine.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..598950bd1ad2a480fb590033fd3dcf17bdead55d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/modeling_moonshine.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/modular_moonshine.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/modular_moonshine.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e080555396d84a50ecbfc5514ff50137941f753b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/modular_moonshine.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/myt5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/myt5/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91ec6e7aed336ecebebc54d5974f27dea0b6ada3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/myt5/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/myt5/__pycache__/tokenization_myt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/myt5/__pycache__/tokenization_myt5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73d6032bffebb116becfff55a344a6b18b3b6f58 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/myt5/__pycache__/tokenization_myt5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc4c520fd348470ea81123168f26f94146c4942e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/image_processing_nougat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/image_processing_nougat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0fe2fb222480431bea3e35b6fb610900ba5396f1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/image_processing_nougat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/image_processing_nougat_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/image_processing_nougat_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f09a78b05c282a3dd5a39a368e9ae5c6511cc92 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/image_processing_nougat_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/processing_nougat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/processing_nougat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42cdb117f51f1507a353d5726dec888b5a55799d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/processing_nougat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/tokenization_nougat_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/tokenization_nougat_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7a9c46742776f1377e1896457f2875a7754043c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/tokenization_nougat_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21a580d20c745314883a9cde18e30fe51ee0042e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/configuration_nystromformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/configuration_nystromformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e296905fc0fdbdba55e31521ea2a52db1248f4b1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/configuration_nystromformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/modeling_nystromformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/modeling_nystromformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d0c8d32d468a2aba03d93e0f3cd16a94a08424d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/modeling_nystromformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..256074f5555278e51a83b42c41ae539821154f55 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/configuration_olmo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/configuration_olmo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf511dbe981ab692d3c02b6a80294e022f6088da Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/configuration_olmo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/modeling_olmo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/modeling_olmo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aba60e0321e72c7ca34827d38a99d1a03bfe7203 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/modeling_olmo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/modular_olmo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/modular_olmo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..680735939b19a33fac01c141c0076f2825ca42f9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/modular_olmo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a823be18326aa479a98a2d8dccc5c3f0686c76e9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/configuration_olmo3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/configuration_olmo3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a0801a84f05621f46a5b84926660547989f3e60 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/configuration_olmo3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/modeling_olmo3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/modeling_olmo3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7a24927b04201e059ee73bffada8cbe4ee8444d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/modeling_olmo3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/modular_olmo3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/modular_olmo3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..178a500813f8fdede8434566de5cc3b9ea6452b4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/modular_olmo3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ef81dc3a581ce85e3e122d9ee4a42db68010867 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/configuration_olmoe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/configuration_olmoe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a48b10a8d517af501d245073a733dfe6c3760579 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/configuration_olmoe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/modeling_olmoe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/modeling_olmoe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40b0330cf6cd9bd743e70d68aae0e48f75a63f90 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/modeling_olmoe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26831561e7f5379aeab71ec1b7abcaa27d3b1c65 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/configuration_omdet_turbo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/configuration_omdet_turbo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a66f7809fe8f00b2ac974d151f371d4e53a0ee3e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/configuration_omdet_turbo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/modeling_omdet_turbo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/modeling_omdet_turbo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4eeb0ba8a08f1207b8d8f00a6ce8440481d45f6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/modeling_omdet_turbo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/processing_omdet_turbo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/processing_omdet_turbo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df77135e36d5bca0e05f36f18486ede2ed647d6c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/processing_omdet_turbo.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..014feb11206605a3c02029f979c713fdbdf8b1b9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/configuration_oneformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/configuration_oneformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..000d9df0872e6bd674858e929283f41ecd8e68f9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/configuration_oneformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f54cf8edeaa734ca8cc9368399cfb7c11d51930e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..584e4a026d33dc59ace5a66cdaad98fc09ef1de8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/processing_oneformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/processing_oneformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c0c37505c2745b52d741eb0d5a39e26255a036b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/processing_oneformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4409a72784b7033c8989d4692dcba0b1627cab0f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/configuration_openai.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/configuration_openai.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a4a4c5d7e5aea7d20935e0729e36d3342f7f7cf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/configuration_openai.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/modeling_openai.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/modeling_openai.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd54402e1f6a98d8840392084a817ae224763724 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/modeling_openai.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/modeling_tf_openai.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/modeling_tf_openai.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16db04d809897b99cd9a71714533e14b6a0ddb50 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/modeling_tf_openai.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/tokenization_openai.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/tokenization_openai.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb8728296758436a87d1dfaaaf7d1f8bb4d2bf36 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/tokenization_openai.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/tokenization_openai_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/tokenization_openai_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..926de034177fbf340531007dfd5b56b23ec0cd27 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/tokenization_openai_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..280f3bb9cf58e2f2079a6fa7be987e19204736cf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/configuration_opt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/configuration_opt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a989bfb9da1344399494a26b863c370bcb37a967 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/configuration_opt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_flax_opt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_flax_opt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3febf9c99d494d11412ca3bb6f0d982ab5456e4a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_flax_opt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_opt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_opt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..baee2daba1f23f8440a64ff331cef13d32ec0f7f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_opt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_tf_opt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_tf_opt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24c1d11a3b13899c4908d5e1be8b7e6777b066e3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_tf_opt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9048afe6adbdc0ad36007e02f60e899cae677c55 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_paligemma import * + from .modeling_paligemma import * + from .processing_paligemma import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..495ce5cd208119bd67e99dcc26f9fa7799d06e34 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/configuration_paligemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/configuration_paligemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b97fef8641a29ca45dd3ef7070dd93b1a40ec9f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/configuration_paligemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/modeling_paligemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/modeling_paligemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e90adc77c5a261f366e2e134c300c057fadf0461 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/modeling_paligemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/processing_paligemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/processing_paligemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..158461d8471ac34b79ca94448db7729e79af3581 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/processing_paligemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/configuration_paligemma.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/configuration_paligemma.py new file mode 100644 index 0000000000000000000000000000000000000000..e4ee4b3b45c2637d804f7eafa7f9cc7afa2638f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/configuration_paligemma.py @@ -0,0 +1,128 @@ +# coding=utf-8 +# Copyright 2024 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PaliGemmamodel configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class PaliGemmaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PaliGemmaForConditionalGeneration`]. It is used to instantiate an + PaliGemmamodel according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the PaliGemma-2B. + + e.g. [paligemma-hf/paligemma-2b](https://huggingface.co/paligemma-hf/paligemma-2b) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`PaliGemmaVisionConfig`, *optional*): + Custom vision config or dict + text_config (`Union[AutoConfig, dict]`, *optional*): + The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`. + image_token_index (`int`, *optional*, defaults to 256000): + The image token index to encode the image prompt. + vocab_size (`int`, *optional*, defaults to 257152): + Vocabulary size of the PaliGemmamodel. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`~PaliGemmaForConditionalGeneration`] + projection_dim (`int`, *optional*, defaults to 2048): + Dimension of the multimodal projection space. + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden layer of the Language model. + + Example: + + ```python + >>> from transformers import PaliGemmaForConditionalGeneration, PaliGemmaConfig, SiglipVisionConfig, GemmaConfig + + >>> # Initializing a Siglip-like vision config + >>> vision_config = SiglipVisionConfig() + + >>> # Initializing a PaliGemma config + >>> text_config = GemmaConfig() + + >>> # Initializing a PaliGemma paligemma-3b-224 style configuration + >>> configuration = PaliGemmaConfig(vision_config, text_config) + + >>> # Initializing a model from the paligemma-3b-224 style configuration + >>> model = PaliGemmaForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "paligemma" + attribute_map = { + "image_token_id": "image_token_index", + } + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vision_config=None, + text_config=None, + image_token_index=256000, + vocab_size=257152, + projection_dim=2048, + hidden_size=2048, + **kwargs, + ): + self.image_token_index = image_token_index + self.projection_dim = projection_dim + self.hidden_size = hidden_size + self.vision_config = vision_config + self.is_encoder_decoder = False + + if isinstance(self.vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") + self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + self.vision_config = CONFIG_MAPPING["siglip_vision_model"]( + intermediate_size=4096, + hidden_size=1152, + patch_size=14, + image_size=224, + num_hidden_layers=27, + num_attention_heads=16, + vocab_size=257152, + vision_use_head=False, + ) + + self.text_config = text_config + if isinstance(self.text_config, dict): + text_config["model_type"] = text_config.get("model_type", "gemma") + self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + self.text_config = CONFIG_MAPPING["gemma"]( + hidden_size=2048, + num_hidden_layers=18, + intermediate_size=16384, + num_attention_heads=8, + num_key_value_heads=1, + is_encoder_decoder=False, + vocab_size=vocab_size, + ) + self.text_config.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2 + self.vision_config.projection_dim = projection_dim + super().__init__(**kwargs) + + +__all__ = ["PaliGemmaConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/modeling_paligemma.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/modeling_paligemma.py new file mode 100644 index 0000000000000000000000000000000000000000..abd8595e24abb7d3850a19144ebe67707b85f7ae --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/modeling_paligemma.py @@ -0,0 +1,625 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch PaliGemmamodel.""" + +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ...cache_utils import Cache, StaticCache +from ...generation import GenerationMixin +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_outputs import BaseModelOutputWithPast +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + ModelOutput, + TransformersKwargs, + auto_docstring, + can_return_tuple, + logging, +) +from ..auto import AutoModel +from .configuration_paligemma import PaliGemmaConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Paligemma outputs, with hidden states and attentions. + """ +) +class PaligemmaModelOutputWithPast(BaseModelOutputWithPast): + r""" + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for PaliGemma causal language model (or autoregressive) outputs. + """ +) +class PaliGemmaCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder after projecting last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +class PaliGemmaMultiModalProjector(nn.Module): + def __init__(self, config: PaliGemmaConfig): + super().__init__() + self.linear = nn.Linear(config.vision_config.hidden_size, config.vision_config.projection_dim, bias=True) + + def forward(self, image_features): + hidden_states = self.linear(image_features) + + return hidden_states + + +@auto_docstring +class PaliGemmaPreTrainedModel(PreTrainedModel): + config: PaliGemmaConfig + base_model_prefix = "" + supports_gradient_checkpointing = True + _no_split_modules = ["PaliGemmaMultiModalProjector"] + _skip_keys_device_placement = "past_key_values" + + _can_compile_fullgraph = False + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + _supports_attention_backend = True + + def _init_weights(self, module): + # important: this ported version of PaliGemmaisn't meant for training from scratch - only + # inference and fine-tuning + std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range) + + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + + +@auto_docstring( + custom_intro=""" + The Base Paligemma model which consists of a vision backbone and a language model without language modeling head., + """ +) +class PaliGemmaModel(PaliGemmaPreTrainedModel): + _checkpoint_conversion_mapping = {"language_model.model": "language_model"} + # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch + accepts_loss_kwargs = False + + def __init__(self, config: PaliGemmaConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config=config.vision_config) + self.multi_modal_projector = PaliGemmaMultiModalProjector(config) + self.vocab_size = config.text_config.vocab_size + + language_model = AutoModel.from_config(config=config.text_config) + self.language_model = language_model + + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + self.text_config_dtype = self.config.get_text_config().dtype or self.dtype + self.post_init() + + # Copied from transformers.models.llava.modeling_llava.LlavaModel.get_input_embeddings with Llava->PaliGemma + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + # Copied from transformers.models.llava.modeling_llava.LlavaModel.set_input_embeddings with Llava->PaliGemma + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def _update_causal_mask( + self, + attention_mask, + token_type_ids=None, + past_key_values=None, + cache_position=None, + input_tensor=None, + is_training: Optional[bool] = None, + ): + if self.config.text_config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + is_training = is_training if is_training is not None else self.training + using_static_cache = isinstance(past_key_values, StaticCache) + min_dtype = torch.finfo(self.text_config_dtype).min + if input_tensor is None: + input_tensor = attention_mask + + inputs_lead_dim, sequence_length = input_tensor.shape[:2] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else cache_position[0] + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + return attention_mask + + causal_mask = torch.full( + (sequence_length, target_length), + fill_value=min_dtype, + dtype=self.text_config_dtype, + device=cache_position.device, + ) + # Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below + if sequence_length != 1: + if is_training: + causal_mask = torch.triu(causal_mask, diagonal=1) + else: + causal_mask[:, :sequence_length] = 0.0 + + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + + # First unmask prefix tokens during training + if is_training: + if token_type_ids is None: + raise ValueError("Token type ids must be provided during training") + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0 + ) + + # Then apply padding mask (will mask pad tokens) + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + def get_image_features(self, pixel_values: torch.FloatTensor): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + The tensors corresponding to the input images. + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ + image_outputs = self.vision_tower(pixel_values) + selected_image_feature = image_outputs.last_hidden_state + image_features = self.multi_modal_projector(selected_image_feature) + image_features = image_features / (self.config.text_config.hidden_size**0.5) + return image_features + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] * image_features.shape[1] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[tuple, PaligemmaModelOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration + + >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224") + >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224") + + >>> prompt = "Where is the cat standing?" + >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs,) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Where is the cat standing?\nsnow" + ```""" + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + is_training = token_type_ids is not None and labels is not None + + # Replace image id with PAD if the image token if OOV, to avoid index-errors + if input_ids is not None and self.config.image_token_id >= self.vocab_size: + special_image_mask = input_ids == self.config.image_token_id + llm_input_ids = input_ids.clone() + llm_input_ids[special_image_mask] = 0 + else: + llm_input_ids = input_ids + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(llm_input_ids) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + 1 # Paligemma positions are 1-indexed + + # Merge text and images + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + causal_mask = self._update_causal_mask( + attention_mask, token_type_ids, past_key_values, cache_position, inputs_embeds, is_training + ) + outputs = self.language_model( + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + **kwargs, + ) + + return PaligemmaModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +@auto_docstring( + custom_intro=""" + The Base Paligemma model which consists of a vision backbone and a language model without language modeling head., + """ +) +class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = { + "^language_model.model": "model.language_model", + "^vision_tower": "model.vision_tower", + "^multi_modal_projector": "model.multi_modal_projector", + "^language_model.lm_head": "lm_head", + } + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: PaliGemmaConfig): + super().__init__(config) + self.model = PaliGemmaModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def get_image_features(self, pixel_values): + return self.model.get_image_features(pixel_values) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + return self.model.multi_modal_projector + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, PaliGemmaCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration + + >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224") + >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224") + + >>> prompt = "Where is the cat standing?" + >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs,) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Where is the cat standing?\nsnow" + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return PaliGemmaCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + pixel_values=None, + attention_mask=None, + token_type_ids=None, + use_cache=True, + logits_to_keep=None, + labels=None, + **kwargs, + ): + # Overwritten -- custom `position_ids` and `pixel_values` handling + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + cache_position=cache_position, + use_cache=use_cache, + logits_to_keep=logits_to_keep, + token_type_ids=token_type_ids, + **kwargs, + ) + + # position_ids in Paligemma are 1-indexed + if model_inputs.get("position_ids") is not None: + model_inputs["position_ids"] += 1 + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model. NOTE: use_cache=False needs pixel_values always + if cache_position[0] == 0: + model_inputs["pixel_values"] = pixel_values + is_training = token_type_ids is not None and labels is not None + is_static_hybrid_cache = isinstance(past_key_values, StaticCache) and any(past_key_values.is_sliding) + if cache_position[0] == 0 and is_static_hybrid_cache: + input_tensor = inputs_embeds if inputs_embeds is not None else input_ids + causal_mask = self.model._update_causal_mask( + attention_mask, token_type_ids, past_key_values, cache_position, input_tensor, is_training + ) + model_inputs["attention_mask"] = causal_mask + + return model_inputs + + @staticmethod + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +__all__ = ["PaliGemmaForConditionalGeneration", "PaliGemmaPreTrainedModel", "PaliGemmaModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/processing_paligemma.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/processing_paligemma.py new file mode 100644 index 0000000000000000000000000000000000000000..242627a0eb71a98f71ca0aadf260cc51df3fb085 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/processing_paligemma.py @@ -0,0 +1,336 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for PaliGemma. +""" + +from typing import Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput, is_valid_image +from ...processing_utils import ( + ImagesKwargs, + MultiModalData, + ProcessingKwargs, + ProcessorMixin, + TextKwargs, + Unpack, +) +from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput +from ...utils import logging + + +logger = logging.get_logger(__name__) + +IMAGE_TOKEN = "" +EXTRA_TOKENS = [f"4}>" for i in range(1024)] + [f"3}>" for i in range(128)] + + +class PaliGemmaTextKwargs(TextKwargs): + suffix: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] + + +class PaliGemmaImagesKwargs(ImagesKwargs): + do_convert_rgb: Optional[bool] + + +class PaliGemmaProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: PaliGemmaTextKwargs + images_kwargs: PaliGemmaImagesKwargs + _defaults = { + "text_kwargs": { + "padding": False, + "return_mm_token_type_ids": False, + }, + "images_kwargs": { + "data_format": "channels_first", + }, + } + + +# Copied from transformers.models.idefics2.processing_idefics2.is_url +def is_url(val) -> bool: + return isinstance(val, str) and val.startswith("http") + + +# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url +def is_image_or_image_url(elem): + return is_url(elem) or is_valid_image(elem) + + +def _is_str_or_image(elem): + return isinstance(elem, (str)) or is_image_or_image_url(elem) + + +def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_images): + """ + Builds a string from the input prompt and image tokens. + For example, for the call: + build_string_from_input( + prompt="Prefix str" + bos_token="", + image_seq_len=3, + image_token="", + ) + The output will be: + "Initial str" + Args: + prompt (`list[Union[str, ImageInput]]`): The input prompt. + bos_token (`str`): The beginning of sentence token. + image_seq_len (`int`): The length of the image sequence. + image_token (`str`): The image token. + num_images (`int`): Number of images in the prompt. + """ + return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n" + + +class PaliGemmaProcessor(ProcessorMixin): + r""" + Constructs a PaliGemma processor which wraps a PaliGemma image processor and a PaliGemma tokenizer into a single processor. + + [`PaliGemmaProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`GemmaTokenizerFast`]. See the + [`~PaliGemmaProcessor.__call__`] and [`~PaliGemmaProcessor.decode`] for more information. + + Args: + image_processor ([`SiglipImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`GemmaTokenizerFast`], *optional*): + The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = ("SiglipImageProcessor", "SiglipImageProcessorFast") + tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast") + + def __init__( + self, + image_processor=None, + tokenizer=None, + chat_template=None, + **kwargs, + ): + if not hasattr(image_processor, "image_seq_length"): + raise ValueError("Image processor is missing an `image_seq_length` attribute.") + + self.image_seq_length = image_processor.image_seq_length + + if not hasattr(tokenizer, "image_token"): + image_token = AddedToken(IMAGE_TOKEN, normalized=False, special=True) + tokens_to_add = {"additional_special_tokens": [image_token]} + tokenizer.add_special_tokens(tokens_to_add) + self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) + self.image_token = IMAGE_TOKEN + else: + self.image_token_id = tokenizer.image_token_id + self.image_token = tokenizer.image_token + + tokenizer.add_tokens(EXTRA_TOKENS) + tokenizer.add_bos_token = False + tokenizer.add_eos_token = False + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[PaliGemmaProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. + + The usage for PaliGemma fine-tuning preparation is slightly different than usual. suffix passed are suffixes to + the prompt in `text`, and will be placed after the prompt. This is because attention is handled differently for + the prefix and the suffix. For instance, + ```python + image = PIL_cow_image + prompt = "answer en Where is the cow standing?" + suffix = "on the beach" + inputs = processor(text=prompt, images=image, suffix=suffix) + ``` + Here `inputs` will contain the `input_ids` and `token_type_ids` that follow + ```python + inputs["input_ids"][:, 256:] + # tensor([[ 2, 6006, 603, 573, 13910, 9980, 235336, 108, 477, 573, 8318]]) + inputs["token_type_ids"][:, 256:] + tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]) + ``` + Meaning the last three tokens are of "label" ("suffix") type while the other ones are of "prefix" type. + + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + suffix (`str`, `list[str]`, `list[list[str]]`): + The suffixes or batch of suffixes to be encoded. Only necessary for finetuning. See https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md + for more information. If your prompt is " What is on the image", the suffix corresponds to the expected prediction "a cow sitting on a bench". + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix` + is provided, the `input_ids` will also contain the suffix input ids. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **labels** -- Labels compatible with training if `suffix` is not None + """ + + output_kwargs = self._merge_kwargs( + PaliGemmaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + suffix = output_kwargs["text_kwargs"].pop("suffix", None) + + return_token_type_ids = suffix is not None + + if images is None: + raise ValueError("`images` are expected as arguments to a `PaliGemmaProcessor` instance.") + if text is None: + logger.warning_once( + "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model." + ) + text = "" + + if _is_str_or_image(text): + text = [text] + elif isinstance(text, list) and _is_str_or_image(text[0]): + pass + + if text is not None and images is not None: + if not any(IMAGE_TOKEN in sample for sample in text): + logger.warning( + "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special " + "image tokens in the text, as many tokens as there are images per each text. It is recommended to " + "add `` tokens in the very beginning of your text. For this call, we will infer how many images " + "each text has and add special tokens." + ) + + if isinstance(text, list) and isinstance(images, list): + if len(images) != len(text): + raise ValueError( + f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image or list of images." + ) + + # make a nested list of lists to be able to iterate over the images and text below + if is_valid_image(images): + images = [[images]] + elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): + images = [[image] for image in images] + elif not ( + isinstance(images, (list, tuple)) + and isinstance(images[0], (list, tuple)) + and is_valid_image(images[0][0]) + ): + raise ValueError("images must be an image, list of images or list of list of images") + + input_strings = [ + build_string_from_input( + prompt=prompt, + bos_token=self.tokenizer.bos_token, + image_seq_len=self.image_seq_length, + image_token=IMAGE_TOKEN, + num_images=len(image_list) if isinstance(image_list, list) else 1, + ) + for prompt, image_list in zip(text, images) + ] + else: + expanded_samples = [] + for sample in text: + expanded_sample = sample.replace(IMAGE_TOKEN, IMAGE_TOKEN * self.image_seq_length) + bos_rfind_index = expanded_sample.rfind(IMAGE_TOKEN) + bos_index = bos_rfind_index + len(IMAGE_TOKEN) if bos_rfind_index != -1 else 0 + expanded_sample = ( + expanded_sample[:bos_index] + self.tokenizer.bos_token + expanded_sample[bos_index:] + ) + expanded_samples.append(expanded_sample) + input_strings = [f"{sample}\n" for sample in expanded_samples] + + if suffix is not None and _is_str_or_image(suffix): + suffix = [suffix] + if suffix is not None: + suffix = [sfx + self.tokenizer.eos_token for sfx in suffix] + pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None) + inputs = self.tokenizer( + input_strings, + text_pair=suffix, + return_token_type_ids=return_token_type_ids, + **output_kwargs["text_kwargs"], + ) + self._check_special_mm_tokens(input_strings, inputs, modalities=["image"]) + + return_data = {**inputs, "pixel_values": pixel_values} + + if return_token_type_ids: + labels = np.array(inputs["input_ids"]) + labels[np.array(inputs["token_type_ids"]) == 0] = -100 + return_data.update({"labels": labels}) + + if return_mm_token_type_ids: + array_ids = np.array(return_data["input_ids"]) + mm_token_type_ids = np.zeros_like(return_data["input_ids"]) + mm_token_type_ids[array_ids == self.image_token_id] = 1 + return_data["mm_token_type_ids"] = mm_token_type_ids.tolist() + + return BatchFeature(data=return_data, tensor_type=return_tensors) + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + + Args: + image_sizes (list[list[str]], *optional*): + The input sizes formatted as (height, width) per each image. + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + vision_data = {} + if image_sizes is not None: + num_image_tokens = [self.image_seq_length] * len(image_sizes) + num_image_patches = [1] * len(image_sizes) + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + return MultiModalData(**vision_data) + + +__all__ = ["PaliGemmaProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87f1a02b801dea628890536045de4b4ffa06c7d1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/configuration_patchtst.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/configuration_patchtst.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aedddb44be5c37cd2701350e33a3a37f358c333e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/configuration_patchtst.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/modeling_patchtst.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/modeling_patchtst.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60ebacf223ae75fd25974633260fdece52480b6b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/modeling_patchtst.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4903c400f9827239f7920b7e2c9bfddfda48eecd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_pegasus import * + from .modeling_flax_pegasus import * + from .modeling_pegasus import * + from .modeling_tf_pegasus import * + from .tokenization_pegasus import * + from .tokenization_pegasus_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/__pycache__/modeling_flax_pegasus.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/__pycache__/modeling_flax_pegasus.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c6994b584a281afc745e6e47b1427ff483d892b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/__pycache__/modeling_flax_pegasus.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/configuration_pegasus.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/configuration_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..3c27f7d44d952d769ae362b000f87b3dd6a1cfd5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/configuration_pegasus.py @@ -0,0 +1,164 @@ +# coding=utf-8 +# Copyright 2021, Google and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PEGASUS model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class PegasusConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PegasusModel`]. It is used to instantiate an + PEGASUS model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the PEGASUS + [google/pegasus-large](https://huggingface.co/google/pegasus-large) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the PEGASUS model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`PegasusModel`] or [`TFPegasusModel`]. + d_model (`int`, *optional*, defaults to 1024): + Dimensionality of the layers and the pooler layer. + encoder_layers (`int`, *optional*, defaults to 12): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 12): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + max_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + scale_embedding (`bool`, *optional*, defaults to `False`): + Scale embeddings by diving by sqrt(d_model). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models) + forced_eos_token_id (`int`, *optional*, defaults to 1): + The id of the token to force as the last generated token when `max_length` is reached. Usually set to + `eos_token_id`. + + Example: + + ```python + >>> from transformers import PegasusConfig, PegasusModel + + >>> # Initializing a PEGASUS google/pegasus-large style configuration + >>> configuration = PegasusConfig() + + >>> # Initializing a model (with random weights) from the google/pegasus-large style configuration + >>> model = PegasusModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "pegasus" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + + def __init__( + self, + vocab_size=50265, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=12, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + use_cache=True, + is_encoder_decoder=True, + activation_function="gelu", + d_model=1024, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=0, + scale_embedding=False, + pad_token_id=0, + eos_token_id=1, + forced_eos_token_id=1, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + super().__init__( + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + +__all__ = ["PegasusConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_flax_pegasus.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_flax_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..ddf0ae492407c17bbc678b82b3d19e34da7d8a75 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_flax_pegasus.py @@ -0,0 +1,1532 @@ +# coding=utf-8 +# Copyright 2021, Google and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flax PEGASUS model.""" + +import math +import random +from functools import partial +from typing import Callable, Optional + +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax +from jax.random import PRNGKey + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, + FlaxSeq2SeqLMOutput, + FlaxSeq2SeqModelOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + add_start_docstrings_to_model_forward, + append_call_sample_docstring, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import add_start_docstrings, logging, replace_return_docstrings +from .configuration_pegasus import PegasusConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/pegasus-large" +_CONFIG_FOR_DOC = "PegasusConfig" + +PEGASUS_START_DOCSTRING = r""" + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`PegasusConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +PEGASUS_INPUTS_DOCSTRING = r""" + Args: + input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the + paper](https://huggingface.co/papers/1910.13461) for more information on the default strategy. + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +PEGASUS_ENCODE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +PEGASUS_DECODE_INPUTS_DOCSTRING = r""" + Args: + decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + encoder_outputs (`tuple(tuple(jnp.ndarray)`): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the + paper](https://huggingface.co/papers/1910.13461) for more information on the default strategy. + decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): + Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast + auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right +def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray: + """ + Shift input ids one token to the right. + """ + shifted_input_ids = jnp.zeros_like(input_ids) + shifted_input_ids = shifted_input_ids.at[:, 1:].set(input_ids[:, :-1]) + shifted_input_ids = shifted_input_ids.at[:, 0].set(decoder_start_token_id) + + shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids) + return shifted_input_ids + + +# Copied from transformers.models.marian.modeling_flax_marian.create_sinusoidal_positions +def create_sinusoidal_positions(n_pos, dim): + position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) + sentinel = dim // 2 + dim % 2 + out = np.zeros_like(position_enc) + out[:, 0:sentinel] = np.sin(position_enc[:, 0::2]) + out[:, sentinel:] = np.cos(position_enc[:, 1::2]) + + return jnp.array(out) + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention with Bart->Pegasus +class FlaxPegasusAttention(nn.Module): + config: PegasusConfig + embed_dim: int + num_heads: int + dropout: float = 0.0 + causal: bool = False + bias: bool = True + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self) -> None: + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {self.num_heads})." + ) + + dense = partial( + nn.Dense, + self.embed_dim, + use_bias=self.bias, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + + self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense() + self.out_proj = dense() + + self.dropout_layer = nn.Dropout(rate=self.dropout) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,)) + + @nn.compact + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states: jnp.ndarray, + key_value_states: Optional[jnp.ndarray] = None, + attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + ) -> tuple[jnp.ndarray]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self.k_proj(key_value_states) + value_states = self.v_proj(key_value_states) + else: + # self_attention + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + # handle cache prepare causal attention mask + if self.causal: + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + + # Convert the boolean attention mask to an attention bias. + if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.dropout > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.dropout, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = self._merge_heads(attn_output) + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +# Copied from transformers.models.mbart.modeling_flax_mbart.FlaxMBartEncoderLayer with MBart->Pegasus +class FlaxPegasusEncoderLayer(nn.Module): + config: PegasusConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.embed_dim = self.config.d_model + self.self_attn = FlaxPegasusAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.encoder_attention_heads, + dropout=self.config.attention_dropout, + dtype=self.dtype, + ) + self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + self.activation_fn = ACT2FN[self.config.activation_function] + self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout) + self.fc1 = nn.Dense( + self.config.encoder_ffn_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.fc2 = nn.Dense( + self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: jnp.ndarray, + output_attentions: bool = True, + deterministic: bool = True, + ) -> tuple[jnp.ndarray]: + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartEncoderLayerCollection with Bart->Pegasus +class FlaxPegasusEncoderLayerCollection(nn.Module): + config: PegasusConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxPegasusEncoderLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.encoder_layers) + ] + self.layerdrop = self.config.encoder_layerdrop + + def __call__( + self, + hidden_states, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for encoder_layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not deterministic and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions, + deterministic, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +# Copied from transformers.models.mbart.modeling_flax_mbart.FlaxMBartDecoderLayer with MBart->Pegasus +class FlaxPegasusDecoderLayer(nn.Module): + config: PegasusConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.embed_dim = self.config.d_model + self.self_attn = FlaxPegasusAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.decoder_attention_heads, + dropout=self.config.attention_dropout, + causal=True, + dtype=self.dtype, + ) + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + self.activation_fn = ACT2FN[self.config.activation_function] + self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout) + + self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.encoder_attn = FlaxPegasusAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.decoder_attention_heads, + dropout=self.config.attention_dropout, + dtype=self.dtype, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.fc1 = nn.Dense( + self.config.decoder_ffn_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.fc2 = nn.Dense( + self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: jnp.ndarray, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + output_attentions: bool = True, + deterministic: bool = True, + ) -> tuple[jnp.ndarray]: + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache + ) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + ) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderLayerCollection with Bart->Pegasus +class FlaxPegasusDecoderLayerCollection(nn.Module): + config: PegasusConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxPegasusDecoderLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.decoder_layers) + ] + self.layerdrop = self.config.decoder_layerdrop + + def __call__( + self, + hidden_states, + attention_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not deterministic and (dropout_probability < self.layerdrop): + layer_outputs = (None, None, None) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + output_attentions=output_attentions, + deterministic=deterministic, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions] + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +class FlaxPegasusEncoder(nn.Module): + config: PegasusConfig + embed_tokens: nn.Embed + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + + embed_dim = self.config.d_model + self.padding_idx = self.config.pad_token_id + self.max_source_positions = self.config.max_position_embeddings + self.embed_scale = math.sqrt(embed_dim) if self.config.scale_embedding else 1.0 + + self.embed_positions = create_sinusoidal_positions(self.config.max_position_embeddings, embed_dim) + self.layers = FlaxPegasusEncoderLayerCollection(self.config, self.dtype) + self.layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + input_shape = input_ids.shape + input_ids = input_ids.reshape(-1, input_shape[-1]) + + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # embed positions + embed_pos = jnp.take(self.embed_positions, position_ids, axis=0) + # explicitly cast the positions here, since self.embed_positions are not registered as parameters + embed_pos = embed_pos.astype(inputs_embeds.dtype) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + outputs = self.layers( + hidden_states, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + last_hidden_state = outputs[0] + last_hidden_state = self.layer_norm(last_hidden_state) + + # update the last element in `hidden_states` after applying `layernorm` above + hidden_states = None + if output_hidden_states: + hidden_states = outputs[1] + hidden_states = hidden_states[:-1] + (last_hidden_state,) + + if not return_dict: + outputs = (last_hidden_state, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:]) + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=last_hidden_state, + hidden_states=hidden_states, + attentions=outputs.attentions, + ) + + +class FlaxPegasusDecoder(nn.Module): + config: PegasusConfig + embed_tokens: nn.Embed + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + + embed_dim = self.config.d_model + self.padding_idx = self.config.pad_token_id + self.max_target_positions = self.config.max_position_embeddings + self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0 + + self.embed_positions = create_sinusoidal_positions(self.config.max_position_embeddings, embed_dim) + + self.layers = FlaxPegasusDecoderLayerCollection(self.config, self.dtype) + self.layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + input_shape = input_ids.shape + input_ids = input_ids.reshape(-1, input_shape[-1]) + + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # embed positions + positions = jnp.take(self.embed_positions, position_ids, axis=0) + # explicitly cast the positions here, since self.embed_positions are not registered as parameters + positions = positions.astype(inputs_embeds.dtype) + + hidden_states = inputs_embeds + positions + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + outputs = self.layers( + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + last_hidden_state = outputs[0] + last_hidden_state = self.layer_norm(last_hidden_state) + + # update the last element in `hidden_states` after applying `layernorm` above + hidden_states = None + if output_hidden_states: + hidden_states = outputs[1] + hidden_states = hidden_states[:-1] + (last_hidden_state,) + + if not return_dict: + outputs = (last_hidden_state, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:]) + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=last_hidden_state, + hidden_states=hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartModule with Bart->Pegasus +class FlaxPegasusModule(nn.Module): + config: PegasusConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.shared = nn.Embed( + self.config.vocab_size, + self.config.d_model, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + dtype=self.dtype, + ) + + self.encoder = FlaxPegasusEncoder(self.config, dtype=self.dtype, embed_tokens=self.shared) + self.decoder = FlaxPegasusDecoder(self.config, dtype=self.dtype, embed_tokens=self.shared) + + def _get_encoder_module(self): + return self.encoder + + def _get_decoder_module(self): + return self.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return FlaxSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +class FlaxPegasusPreTrainedModel(FlaxPreTrainedModel): + config_class = PegasusConfig + base_model_prefix: str = "model" + module_class: nn.Module = None + + def __init__( + self, + config: PegasusConfig, + input_shape: tuple[int] = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + attention_mask = jnp.ones_like(input_ids) + decoder_input_ids = input_ids + decoder_attention_mask = jnp.ones_like(input_ids) + + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init( + rngs, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + )["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def init_cache(self, batch_size, max_length, encoder_outputs): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`): + `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: + `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) + is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + """ + # init input variables to retrieve cache + decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4") + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + decoder_position_ids = jnp.broadcast_to( + jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape + ) + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + return decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + + init_variables = self.module.init( + jax.random.PRNGKey(0), + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + init_cache=True, + method=_decoder_forward, # we only need to call the decoder to init the cache + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings(PEGASUS_ENCODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=PegasusConfig) + def encode( + self, + input_ids: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: Optional[dict] = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration + + >>> model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-large") + >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large") + + >>> text = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer(text, max_length=1024, return_tensors="np") + >>> encoder_outputs = model.encode(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + if position_ids is None: + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs): + encode_module = module._get_encoder_module() + return encode_module(input_ids, attention_mask, position_ids, **kwargs) + + return self.module.apply( + {"params": params or self.params}, + input_ids=jnp.array(input_ids, dtype="i4"), + attention_mask=jnp.array(attention_mask, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + method=_encoder_forward, + ) + + @add_start_docstrings(PEGASUS_DECODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class=PegasusConfig) + def decode( + self, + decoder_input_ids, + encoder_outputs, + encoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + past_key_values: Optional[dict] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: Optional[dict] = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration + + >>> model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-large") + >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large") + + >>> text = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer(text, max_length=1024, return_tensors="np") + >>> encoder_outputs = model.encode(**inputs) + + >>> decoder_start_token_id = model.config.decoder_start_token_id + >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id + + >>> outputs = model.decode(decoder_input_ids, encoder_outputs) + >>> last_decoder_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + encoder_hidden_states = encoder_outputs[0] + if encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + batch_size, sequence_length = decoder_input_ids.shape + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + if decoder_position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.") + + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be + # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that + # it can be changed by FlaxPegasusAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + return decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + + outputs = self.module.apply( + inputs, + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + mutable=mutable, + method=_decoder_forward, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past = outputs + outputs["past_key_values"] = unfreeze(past["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past = outputs + outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:] + + return outputs + + @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING) + def __call__( + self, + input_ids: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + decoder_input_ids: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: Optional[dict] = None, + dropout_rng: PRNGKey = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # prepare encoder inputs + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + if position_ids is None: + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + # prepare decoder inputs + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, decoder_start_token_id=self.config.decoder_start_token_id + ) + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + if decoder_position_ids is None: + batch_size, sequence_length = decoder_input_ids.shape + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {"dropout": dropout_rng} if dropout_rng is not None else {} + + return self.module.apply( + {"params": params or self.params}, + input_ids=jnp.array(input_ids, dtype="i4"), + attention_mask=jnp.array(attention_mask, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + ) + + +@add_start_docstrings( + "The bare Pegasus Model transformer outputting raw hidden-states without any specific head on top.", + PEGASUS_START_DOCSTRING, +) +class FlaxPegasusModel(FlaxPegasusPreTrainedModel): + config: PegasusConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + module_class = FlaxPegasusModule + + +append_call_sample_docstring(FlaxPegasusModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC) + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartForConditionalGenerationModule with Bart->Pegasus +class FlaxPegasusForConditionalGenerationModule(nn.Module): + config: PegasusConfig + dtype: jnp.dtype = jnp.float32 + bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.model = FlaxPegasusModule(config=self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.model.shared.num_embeddings, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, self.model.shared.num_embeddings)) + + def _get_encoder_module(self): + return self.model.encoder + + def _get_decoder_module(self): + return self.model.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + position_ids=position_ids, + decoder_position_ids=decoder_position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_embedding = self.model.variables["params"]["shared"]["embedding"] + lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + lm_logits = self.lm_head(hidden_states) + + lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return output + + return FlaxSeq2SeqLMOutput( + logits=lm_logits, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + "The PEGASUS Model with a language modeling head. Can be used for summarization.", PEGASUS_START_DOCSTRING +) +class FlaxPegasusForConditionalGeneration(FlaxPegasusPreTrainedModel): + module_class = FlaxPegasusForConditionalGenerationModule + dtype: jnp.dtype = jnp.float32 + + @add_start_docstrings(PEGASUS_DECODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=PegasusConfig) + def decode( + self, + decoder_input_ids, + encoder_outputs, + encoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + past_key_values: Optional[dict] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + deterministic: bool = True, + params: Optional[dict] = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration + + >>> model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-large") + >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large") + + >>> text = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer(text, max_length=1024, return_tensors="np") + >>> encoder_outputs = model.encode(**inputs) + + >>> decoder_start_token_id = model.config.decoder_start_token_id + >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id + + >>> outputs = model.decode(decoder_input_ids, encoder_outputs) + >>> logits = outputs.logits + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + encoder_hidden_states = encoder_outputs[0] + if encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + batch_size, sequence_length = decoder_input_ids.shape + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + if decoder_position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.") + + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be + # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that + # it can be changed by FlaxPegasusAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + outputs = decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_embedding = module.model.variables["params"]["shared"]["embedding"] + lm_logits = module.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + lm_logits = module.lm_head(hidden_states) + + lm_logits += module.final_logits_bias.astype(self.dtype) + return lm_logits, outputs + + outputs = self.module.apply( + inputs, + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + rngs=rngs, + mutable=mutable, + method=_decoder_forward, + ) + + if past_key_values is None: + lm_logits, decoder_outputs = outputs + else: + (lm_logits, decoder_outputs), past = outputs + + if return_dict: + outputs = FlaxCausalLMOutputWithCrossAttentions( + logits=lm_logits, + hidden_states=decoder_outputs.hidden_states, + attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + ) + else: + outputs = (lm_logits,) + decoder_outputs[1:] + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs["past_key_values"] = unfreeze(past["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:] + + return outputs + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + max_length, + attention_mask: Optional[jax.Array] = None, + decoder_attention_mask: Optional[jax.Array] = None, + encoder_outputs=None, + **kwargs, + ): + # initializing the cache + batch_size, seq_length = decoder_input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length, encoder_outputs) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyways. + # Thus we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if decoder_attention_mask is not None: + position_ids = decoder_attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "encoder_attention_mask": attention_mask, + "decoder_attention_mask": extended_attention_mask, + "decoder_position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1 + return model_kwargs + + +FLAX_PEGASUS_CONDITIONAL_GENERATION_DOCSTRING = """ + Returns: + + Summarization example: + + ```python + >>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration + + >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large') + >>> tokenizer = AutoTokenizer.from_pretrained('google/pegasus-large') + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='np') + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs['input_ids']).sequences + >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) + ``` + + Mask filling example: + + ```python + >>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration + + >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large") + >>> TXT = "My friends are but they eat too many carbs." + + >>> model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-large") + >>> input_ids = tokenizer([TXT], return_tensors="np")["input_ids"] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = jax.nn.softmax(logits[0, masked_index], axis=0) + >>> values, predictions = jax.lax.top_k(probs) + + >>> tokenizer.decode(predictions).split() + ``` +""" + +overwrite_call_docstring( + FlaxPegasusForConditionalGeneration, PEGASUS_INPUTS_DOCSTRING + FLAX_PEGASUS_CONDITIONAL_GENERATION_DOCSTRING +) +append_replace_return_docstrings( + FlaxPegasusForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC +) + + +__all__ = ["FlaxPegasusForConditionalGeneration", "FlaxPegasusModel", "FlaxPegasusPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_pegasus.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..dc3a8005acac814393789cfa6146f413af2863bd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_pegasus.py @@ -0,0 +1,1671 @@ +# coding=utf-8 +# Copyright 2021, Google and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch PEGASUS model.""" + +import copy +import math +from typing import Callable, Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import ( + AttentionMaskConverter, + _prepare_4d_attention_mask, + _prepare_4d_attention_mask_for_sdpa, +) +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + auto_docstring, + is_torch_flex_attn_available, + is_torchdynamo_compiling, + logging, +) +from ...utils.deprecation import deprecate_kwarg +from .configuration_pegasus import PegasusConfig + + +if is_torch_flex_attn_available(): + from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.bart.modeling_bart.shift_tokens_right +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +# Copied from transformers.models.marian.modeling_marian.MarianSinusoidalPositionalEmbedding with Marian->Pegasus +class PegasusSinusoidalPositionalEmbedding(nn.Embedding): + """This module produces sinusoidal positional embeddings of any length.""" + + def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None: + super().__init__(num_positions, embedding_dim) + + def _init_weight(self): + """ + Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in + the 2nd half of the vector. [dim // 2:] + """ + n_pos, dim = self.weight.shape + position_enc = np.array( + [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] + ) + out = torch.empty(n_pos, dim, dtype=self.weight.dtype, requires_grad=False) + sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1 + out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + self.weight = nn.Parameter(out, requires_grad=False) + + @torch.no_grad() + def forward( + self, input_ids_shape: torch.Size, past_key_values_length: int = 0, position_ids: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """`input_ids_shape` is expected to be [bsz x seqlen].""" + if position_ids is None: + bsz, seq_len = input_ids_shape[:2] + position_ids = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(position_ids) + + +# Copied from transformers.models.bart.modeling_bart.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: Optional[float] = None, + dropout: float = 0.0, + head_mask: Optional[torch.Tensor] = None, + **kwargs, +): + if scaling is None: + scaling = query.size(-1) ** -0.5 + + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if head_mask is not None: + attn_weights = attn_weights * head_mask.view(1, -1, 1, 1) + + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Pegasus +class PegasusAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[PegasusConfig] = None, + layer_idx: Optional[int] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + cache_position: Optional[torch.Tensor] = None, + # TODO: we need a refactor so that the different attention modules can get their specific kwargs + # ATM, we have mixed things encoder, decoder, and encoder-decoder attn + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + # determine input shapes + bsz, tgt_len = hidden_states.shape[:-1] + src_len = key_value_states.shape[1] if is_cross_attention else tgt_len + + q_input_shape = (bsz, tgt_len, -1, self.head_dim) + kv_input_shape = (bsz, src_len, -1, self.head_dim) + + # get query proj + query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2) + + is_updated = False + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.layers[self.layer_idx].keys + value_states = curr_past_key_value.layers[self.layer_idx].values + else: + key_states = self.k_proj(current_states) + value_states = self.v_proj(current_states) + key_states = key_states.view(*kv_input_shape).transpose(1, 2) + value_states = value_states.view(*kv_input_shape).transpose(1, 2) + + if past_key_values is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.dropout, + scaling=self.scaling, + output_attentions=output_attentions, + head_mask=layer_head_mask, + **kwargs, + ) + + attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->Pegasus, MBART->PEGASUS +class PegasusEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: PegasusConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = PegasusAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + config=config, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_head_mask: torch.Tensor, + output_attentions: bool = False, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16: + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + return hidden_states, attn_weights + + +# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer with MBart->Pegasus, MBART->PEGASUS +class PegasusDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: PegasusConfig, layer_idx: Optional[int] = None): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = PegasusAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + is_causal=True, + config=config, + layer_idx=layer_idx, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = PegasusAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + config=config, + layer_idx=layer_idx, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + cache_position: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_values (`Cache`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + past_key_values=past_key_values, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +@auto_docstring +class PegasusPreTrainedModel(PreTrainedModel): + config: PegasusConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, PegasusSinusoidalPositionalEmbedding): + module._init_weight() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask + def _update_full_mask( + self, + attention_mask: Union[torch.Tensor, None], + inputs_embeds: torch.Tensor, + ): + if attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if 0 in attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + elif self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + return attention_mask + + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: Optional[Union[torch.Tensor, "BlockMask"]], + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + ): + if self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask) + # Other attention flavors support in-built causal (when `mask is None`) + # while we need to create our specific block mask regardless + elif attention_mask is None: + attention_mask = make_flex_block_causal_mask( + torch.ones( + size=(input_tensor.shape[0], input_tensor.shape[1]), + device=attention_mask.device, + ) + ) + return attention_mask + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and (attention_mask == 0.0).any(): + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_compilable_cache: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype = input_tensor.dtype + sequence_length = input_tensor.shape[1] + if using_compilable_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_cross_attn_mask + def _update_cross_attn_mask( + self, + encoder_hidden_states: Union[torch.Tensor, None], + encoder_attention_mask: Union[torch.Tensor, None], + input_shape: torch.Size, + inputs_embeds: torch.Tensor, + ): + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, + inputs_embeds.dtype, + tgt_len=input_shape[-1], + ) + elif self.config._attn_implementation == "flex_attention": + if isinstance(encoder_attention_mask, torch.Tensor): + encoder_attention_mask = make_flex_block_causal_mask( + encoder_attention_mask, + query_length=input_shape[-1], + is_causal=False, + ) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + return encoder_attention_mask + + +class PegasusEncoder(PegasusPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`PegasusEncoderLayer`]. + + Args: + config: PegasusConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + + self.embed_positions = PegasusSinusoidalPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + self.padding_idx, + ) + self.layers = nn.ModuleList([PegasusEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layer_norm = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings matrix of the model if `new_num_position_embeddings != + config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embeddings. If position embeddings are learned, increasing the size will add + newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If + position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will + add correct vectors at the end following the position encoding algorithm, whereas reducing the size + will remove vectors from the end. + """ + logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...") + self.config.max_position_embeddings = new_num_position_embeddings + + self.embed_positions = PegasusSinusoidalPositionalEmbedding( + self.config.max_position_embeddings, + self.config.d_model, + self.padding_idx, + ) + self.embed_positions._init_weight() + self.embed_positions.to(self.device) + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings matrix + """ + return self.embed_positions + + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + attention_mask = self._update_full_mask( + attention_mask, + inputs_embeds, + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.size()[0] != len(self.layers): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class PegasusDecoder(PegasusPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`PegasusDecoderLayer`] + + Args: + config: PegasusConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = PegasusSinusoidalPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + self.padding_idx, + ) + self.layers = nn.ModuleList([PegasusDecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)]) + self.layer_norm = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings matrix of the model if `new_num_position_embeddings != + config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embeddings. If position embeddings are learned, increasing the size will add + newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If + position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will + add correct vectors at the end following the position encoding algorithm, whereas reducing the size + will remove vectors from the end. + """ + logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...") + self.config.max_position_embeddings = new_num_position_embeddings + + self.embed_positions = PegasusSinusoidalPositionalEmbedding( + self.config.max_position_embeddings, + self.config.d_model, + self.padding_idx, + ) + self.embed_positions._init_weight() + self.embed_positions.to(self.device) + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings matrix + """ + return self.embed_positions + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + cache_position=None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in decoder to avoid performing + cross-attention on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_shape = input.shape + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input) + + # important to apply scale outside of `if` in case users pass `embeds` + inputs_embeds = inputs_embeds * self.embed_scale + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..." + ) + use_cache = False + + # initialize `past_key_values` + if use_cache and past_key_values is None: + past_key_values = ( + EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + if encoder_hidden_states is not None + else DynamicCache(config=self.config) + ) + if use_cache and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + batch_size, seq_length = inputs_embeds.size()[:-1] + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device + ) + + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past cache + mask_seq_length = past_key_values_length + seq_length + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + + self_attn_cache = ( + past_key_values.self_attention_cache + if isinstance(past_key_values, EncoderDecoderCache) + else past_key_values + ) + + causal_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + self_attn_cache, + ) + encoder_attention_mask = self._update_cross_attn_mask( + encoder_hidden_states, + encoder_attention_mask, + input_shape, + inputs_embeds, + ) + + # embed positions + positions = self.embed_positions((batch_size, seq_length), past_key_values_length, position_ids=cache_position) + hidden_states = inputs_embeds + positions + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != len(self.layers): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + layer_outputs = decoder_layer( + hidden_states, + causal_mask, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None), + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@auto_docstring +class PegasusModel(PegasusPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config: PegasusConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) + + self.encoder = PegasusEncoder(config, self.shared) + self.decoder = PegasusDecoder(config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings matrix of the model if `new_num_position_embeddings != + config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embeddings. If position embeddings are learned, increasing the size will add + newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If + position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will + add correct vectors at the end following the position encoding algorithm, whereas reducing the size + will remove vectors from the end. + """ + self.config.max_position_embeddings = new_num_position_embeddings + self.encoder.resize_position_embeddings(new_num_position_embeddings) + self.decoder.resize_position_embeddings(new_num_position_embeddings) + + def get_position_embeddings(self) -> tuple[nn.Embedding]: + """ + Returns the position embeddings matrix + """ + return (self.encoder.get_position_embeddings(), self.decoder.get_position_embeddings()) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.Tensor] = None, + decoder_attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[tuple[torch.FloatTensor]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple, Seq2SeqModelOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Pegasus uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + Example: + + ```python + >>> from transformers import AutoTokenizer, PegasusModel + + >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large") + >>> model = PegasusModel.from_pretrained("google/pegasus-large") + + >>> inputs = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt") + >>> decoder_inputs = tokenizer("Studies show that", return_tensors="pt") + >>> outputs = model(input_ids=inputs.input_ids, decoder_input_ids=decoder_inputs.input_ids) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 4, 1024] + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The PEGASUS Model with a language modeling head. Can be used for summarization. + """ +) +class PegasusForConditionalGeneration(PegasusPreTrainedModel, GenerationMixin): + base_model_prefix = "model" + _keys_to_ignore_on_load_missing = ["final_logits_bias"] + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + + def __init__(self, config: PegasusConfig): + super().__init__(config) + self.model = PegasusModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings( + self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True + ) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing) + self._resize_final_logits_bias(new_embeddings.weight.shape[0]) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings matrix of the model if `new_num_position_embeddings != + config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embeddings. If position embeddings are learned, increasing the size will add + newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If + position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will + add correct vectors at the end following the position encoding algorithm, whereas reducing the size + will remove vectors from the end. + """ + self.config.max_position_embeddings = new_num_position_embeddings + self.model.encoder.resize_position_embeddings(new_num_position_embeddings) + self.model.decoder.resize_position_embeddings(new_num_position_embeddings) + + def get_position_embeddings(self) -> tuple[nn.Embedding]: + """ + Returns the position embeddings matrix + """ + return (self.model.encoder.get_position_embeddings(), self.model.decoder.get_position_embeddings()) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.Tensor] = None, + decoder_attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[tuple[torch.FloatTensor]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple, Seq2SeqLMOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Pegasus uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example Summarization: + + ```python + >>> from transformers import AutoTokenizer, PegasusForConditionalGeneration + + >>> model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum") + >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum") + + >>> ARTICLE_TO_SUMMARIZE = ( + ... "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + ... "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + ... "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + ... ) + >>> inputs = tokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, return_tensors="pt") + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs["input_ids"]) + >>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "California's largest electricity provider has turned off power to hundreds of thousands of customers." + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + +# Copied from transformers.models.bart.modeling_bart.BartDecoderWrapper with Bart->Pegasus +class PegasusDecoderWrapper(PegasusPreTrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. + """ + + def __init__(self, config): + super().__init__(config) + self.decoder = PegasusDecoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +class PegasusForCausalLM(PegasusPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + config = copy.deepcopy(config) + config.is_decoder = True + config.is_encoder_decoder = False + super().__init__(config) + self.model = PegasusDecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings matrix + """ + return self.model.decoder.get_position_embeddings() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings matrix of the model if `new_num_position_embeddings != + config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embeddings. If position embeddings are learned, increasing the size will add + newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If + position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will + add correct vectors at the end following the position encoding algorithm, whereas reducing the size + will remove vectors from the end. + """ + self.config.max_position_embeddings = new_num_position_embeddings + self.model.decoder.resize_position_embeddings(new_num_position_embeddings) + + @auto_docstring + # Copied from transformers.models.bart.modeling_bart.BartForCausalLM.forward with Bart->Pegasus, facebook/bart-base->google/pegasus-large + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, CausalLMOutputWithCrossAttentions]: + r""" + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, PegasusForCausalLM + + >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large") + >>> model = PegasusForCausalLM.from_pretrained("google/pegasus-large", add_cross_attention=False) + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size] + >>> list(logits.shape) == expected_shape + True + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +__all__ = ["PegasusForCausalLM", "PegasusForConditionalGeneration", "PegasusModel", "PegasusPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_tf_pegasus.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_tf_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..d159fc00138dcb6ca2f14213857b6b21e97c6865 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_tf_pegasus.py @@ -0,0 +1,1573 @@ +# coding=utf-8 +# Copyright 2021, Google Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 Pegasus model.""" + +from __future__ import annotations + +import random + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPastAndCrossAttentions, + TFSeq2SeqLMOutput, + TFSeq2SeqModelOutput, +) + +# Public API +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFModelInputType, + TFPreTrainedModel, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_pegasus import PegasusConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/pegasus-large" +_CONFIG_FOR_DOC = "PegasusConfig" + + +LARGE_NEGATIVE = -1e8 + + +# Copied from transformers.models.bart.modeling_tf_bart.shift_tokens_right +def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): + pad_token_id = tf.cast(pad_token_id, input_ids.dtype) + decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) + start_tokens = tf.fill( + (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype) + ) + shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, + ) + + # "Verify that `labels` has only positive values and -100" + assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype)) + + # Make sure the assertion op is called by wrapping the result in an identity no-op + with tf.control_dependencies([assert_gte0]): + shifted_input_ids = tf.identity(shifted_input_ids) + + return shifted_input_ids + + +# Copied from transformers.models.bart.modeling_tf_bart._make_causal_mask +def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz = input_ids_shape[0] + tgt_len = input_ids_shape[1] + mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE + mask_cond = tf.range(shape_list(mask)[-1]) + + mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask) + + if past_key_values_length > 0: + mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) + + +# Copied from transformers.models.bart.modeling_tf_bart._expand_mask +def _expand_mask(mask: tf.Tensor, tgt_len: int | None = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + src_len = shape_list(mask)[1] + tgt_len = tgt_len if tgt_len is not None else src_len + one_cst = tf.constant(1.0) + mask = tf.cast(mask, dtype=one_cst.dtype) + expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)) + + return (one_cst - expanded_mask) * LARGE_NEGATIVE + + +# Copied from transformers.models.marian.modeling_tf_marian.TFMarianSinusoidalPositionalEmbedding with Marian->Pegasus +class TFPegasusSinusoidalPositionalEmbedding(keras.layers.Layer): + """This module produces sinusoidal positional embeddings of any length.""" + + def __init__(self, num_positions: int, embedding_dim: int, **kwargs): + super().__init__(**kwargs) + + if embedding_dim % 2 != 0: + raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported") + + self.embedding_dim = embedding_dim + self.num_positions = num_positions + + def build(self, input_shape: tf.TensorShape): + """ + Build shared token embedding layer Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + + weight = self._init_weight(self.num_positions, self.embedding_dim) + + self.weight = self.add_weight( + name="embeddings", + shape=[self.num_positions, self.embedding_dim], + ) + weight = tf.cast(weight, dtype=self.weight.dtype) + + self.weight.assign(weight) + + super().build(input_shape) + + @staticmethod + def _init_weight(n_pos: int, dim: int): + """ + Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in + the 2nd half of the vector. [dim // 2:] + """ + position_enc = np.array( + [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] + ) + table = np.zeros_like(position_enc) + # index 0 is all zero + table[:, 0 : dim // 2] = np.sin(position_enc[:, 0::2]) + table[:, dim // 2 :] = np.cos(position_enc[:, 1::2]) + # convert to tensor + table = tf.convert_to_tensor(table) + tf.stop_gradient(table) + return table + + def call( + self, input_shape: tf.TensorShape, past_key_values_length: int = 0, position_ids: tf.Tensor | None = None + ): + """Input is expected to be of size [bsz x seqlen].""" + if position_ids is None: + seq_len = input_shape[1] + position_ids = tf.range(past_key_values_length, seq_len + past_key_values_length, delta=1, name="range") + return tf.gather(self.weight, position_ids) + + +# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Pegasus +class TFPegasusAttention(keras.layers.Layer): + """Multi-headed attention from "Attention Is All You Need""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + + self.num_heads = num_heads + self.dropout = keras.layers.Dropout(dropout) + self.head_dim = embed_dim // num_heads + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + + def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): + return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) + + def call( + self, + hidden_states: tf.Tensor, + key_value_states: tf.Tensor | None = None, + past_key_value: tuple[tuple[tf.Tensor]] | None = None, + attention_mask: tf.Tensor | None = None, + layer_head_mask: tf.Tensor | None = None, + training: bool | None = False, + ) -> tuple[tf.Tensor, tf.Tensor | None]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = shape_list(hidden_states) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = tf.concat([past_key_value[0], key_states], axis=2) + value_states = tf.concat([past_key_value[1], value_states], axis=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape) + key_states = tf.reshape(key_states, proj_shape) + value_states = tf.reshape(value_states, proj_shape) + + src_len = shape_list(key_states)[1] + attn_weights = tf.matmul(query_states, key_states, transpose_b=True) + + tf.debugging.assert_equal( + shape_list(attn_weights), + [bsz * self.num_heads, tgt_len, src_len], + message=( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {shape_list(attn_weights)}" + ), + ) + + if attention_mask is not None: + tf.debugging.assert_equal( + shape_list(attention_mask), + [bsz, 1, tgt_len, src_len], + message=( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {shape_list(attention_mask)}" + ), + ) + + attention_mask = tf.cast(attention_mask, dtype=attn_weights.dtype) + attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_weights = stable_softmax(attn_weights, axis=-1) + + if layer_head_mask is not None: + tf.debugging.assert_equal( + shape_list(layer_head_mask), + [self.num_heads], + message=( + f"Head mask for a single layer should be of size {(self.num_heads)}, but is" + f" {shape_list(layer_head_mask)}" + ), + ) + + attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape( + attn_weights, (bsz, self.num_heads, tgt_len, src_len) + ) + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_probs = self.dropout(attn_weights, training=training) + attn_output = tf.matmul(attn_probs, value_states) + + tf.debugging.assert_equal( + shape_list(attn_output), + [bsz * self.num_heads, tgt_len, self.head_dim], + message=( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {shape_list(attn_output)}" + ), + ) + + attn_output = tf.transpose( + tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3) + ) + attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim)) + + attn_output = self.out_proj(attn_output) + attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + + return attn_output, attn_weights, past_key_value + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build([None, None, self.embed_dim]) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build([None, None, self.embed_dim]) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build([None, None, self.embed_dim]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.embed_dim]) + + +# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Pegasus +class TFPegasusEncoderLayer(keras.layers.Layer): + def __init__(self, config: PegasusConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFPegasusAttention( + self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" + ) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) + self.activation_fn = get_tf_activation(config.activation_function) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + layer_head_mask: tf.Tensor, + training: bool | None = False, + ): + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* + attention_mask (`tf.Tensor`): attention mask of size + *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size + *(encoder_attention_heads,)* + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, self_attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask + ) + + tf.debugging.assert_equal( + shape_list(hidden_states), + shape_list(residual), + message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}", + ) + + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + return hidden_states, self_attn_weights + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.embed_dim]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.encoder_ffn_dim]) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) + + +# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Pegasus +class TFPegasusDecoderLayer(keras.layers.Layer): + def __init__(self, config: PegasusConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFPegasusAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + name="self_attn", + is_decoder=True, + ) + self.dropout = keras.layers.Dropout(config.dropout) + self.activation_fn = get_tf_activation(config.activation_function) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.encoder_attn = TFPegasusAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + name="encoder_attn", + is_decoder=True, + ) + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor | None = None, + encoder_hidden_states: tf.Tensor | None = None, + encoder_attention_mask: tf.Tensor | None = None, + layer_head_mask: tf.Tensor | None = None, + cross_attn_layer_head_mask: tf.Tensor | None = None, + past_key_value: tuple[tf.Tensor] | None = None, + training: bool | None = False, + ) -> tuple[tf.Tensor, tf.Tensor, tuple[tuple[tf.Tensor]]]: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* + attention_mask (`tf.Tensor`): attention mask of size + *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + encoder_hidden_states (`tf.Tensor`): + cross attention input to the layer of shape *(batch, seq_len, embed_dim)* + encoder_attention_mask (`tf.Tensor`): encoder attention mask of size + *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size + *(decoder_attention_heads,)* + cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module. + *(decoder_attention_heads,)* + past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + return ( + hidden_states, + self_attn_weights, + cross_attn_weights, + present_key_value, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.embed_dim]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.decoder_ffn_dim]) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) + + +class TFPegasusPreTrainedModel(TFPreTrainedModel): + config_class = PegasusConfig + base_model_prefix = "model" + + +PEGASUS_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Args: + config ([`PegasusConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +PEGASUS_GENERATION_EXAMPLE = r""" + Summarization example: + + ```python + >>> from transformers import AutoTokenizer, TFPegasusForConditionalGeneration + + >>> model = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum") + >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum") + + >>> ARTICLE_TO_SUMMARIZE = ( + ... "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + ... "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + ... "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + ... ) + >>> inputs = tokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, return_tensors="tf") + + >>> # Generate Summary + >>> summary_ids = model.generate(input_ids) + >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) + ``` +""" + +PEGASUS_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Pegasus uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + will be made by default and ignore pad tokens. It is not recommended to set this for most use cases. + decoder_position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tf.FloatTensor`, *optional*): + hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + of shape `(batch_size, sequence_length, hidden_size)` is a sequence of + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation output_attentions (`bool`, + *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` + under returned tensors for more detail. This argument can be used only in eager mode, in graph mode the + value in the config will be used instead. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@keras_serializable +class TFPegasusEncoder(keras.layers.Layer): + config_class = PegasusConfig + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`TFPegasusEncoderLayer`]. + + Args: + config: PegasusConfig + """ + + def __init__(self, config: PegasusConfig, embed_tokens: keras.layers.Embedding | None = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.dropout = keras.layers.Dropout(config.dropout) + self.layerdrop = config.encoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 + + self.embed_tokens = embed_tokens + self.embed_positions = TFPegasusSinusoidalPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + name="embed_positions", + ) + self.layers = [TFPegasusEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + + def get_embed_tokens(self): + return self.embed_tokens + + def set_embed_tokens(self, embed_tokens): + self.embed_tokens = embed_tokens + + @unpack_inputs + def call( + self, + input_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ): + """ + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, `optional): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value + in the config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. This argument can be used only in eager mode, in graph mode the value in the config + will be used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used + in eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). + """ + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim) + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + hidden_states = inputs_embeds + embed_pos + hidden_states = self.dropout(hidden_states, training=training) + + # check attention mask and invert + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask) + else: + attention_mask = None + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + tf.debugging.assert_equal( + shape_list(head_mask)[0], + len(self.layers), + message=( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {shape_list(head_mask)[0]}." + ), + ) + + # encoder layers + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if training and (dropout_probability < self.layerdrop): # skip the layer + continue + + hidden_states, attn = encoder_layer( + hidden_states, + attention_mask, + head_mask[idx] if head_mask is not None else None, + ) + + if output_attentions: + all_attentions += (attn,) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFPegasusDecoder(keras.layers.Layer): + config_class = PegasusConfig + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFPegasusDecoderLayer`] + + Args: + config: PegasusConfig + embed_tokens: output embedding + """ + + def __init__(self, config: PegasusConfig, embed_tokens: keras.layers.Embedding | None = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.padding_idx = config.pad_token_id + self.embed_tokens = embed_tokens + self.layerdrop = config.decoder_layerdrop + self.embed_positions = TFPegasusSinusoidalPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + name="embed_positions", + ) + self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 + self.layers = [TFPegasusDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + + self.dropout = keras.layers.Dropout(config.dropout) + + def get_embed_tokens(self): + return self.embed_tokens + + def set_embed_tokens(self, embed_tokens): + self.embed_tokens = embed_tokens + + @unpack_inputs + def call( + self, + input_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + encoder_hidden_states: tf.Tensor | None = None, + encoder_attention_mask: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + cross_attn_head_mask: tf.Tensor | None = None, + past_key_values: tuple[tuple[tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ): + r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value + in the config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. This argument can be used only in eager mode, in graph mode the value in the config + will be used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used + in eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). + """ + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + past_key_values_length = shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0 + + # embed positions + if position_ids is None: + positions = self.embed_positions(input_shape, past_key_values_length) + else: + positions = self.embed_positions(input_shape, position_ids=position_ids) + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim) + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + hidden_states = inputs_embeds + + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + else: + combined_attention_mask = _expand_mask( + tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1] + ) + + if attention_mask is not None: + combined_attention_mask = combined_attention_mask + _expand_mask(attention_mask, tgt_len=input_shape[-1]) + + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1]) + + hidden_states = self.dropout(hidden_states + positions, training=training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attns = () if (output_attentions and encoder_hidden_states is not None) else None + present_key_values = () if use_cache else None + + # check if head_mask and cross_attn_head_mask have a correct number of layers specified if desired + for attn_mask_name, attn_mask in [("head_mask", head_mask), ("cross_attn_head_mask", cross_attn_head_mask)]: + if attn_mask is not None: + tf.debugging.assert_equal( + shape_list(attn_mask)[0], + len(self.layers), + message=( + f"The {attn_mask_name} should be specified for {len(self.layers)} layers, but it is for" + f" {shape_list(attn_mask)[0]}." + ), + ) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + + if training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + hidden_states, layer_self_attn, layer_cross_attn, present_key_value = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=head_mask[idx] if head_mask is not None else None, + cross_attn_layer_head_mask=cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + past_key_value=past_key_value, + ) + + if use_cache: + present_key_values += (present_key_value,) + + if output_attentions: + all_self_attns += (layer_self_attn,) + + if encoder_hidden_states is not None: + all_cross_attns += (layer_cross_attn,) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return hidden_states, present_key_values, all_hidden_states, all_self_attns, all_cross_attns + else: + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFPegasusMainLayer(keras.layers.Layer): + config_class = PegasusConfig + + def __init__(self, config: PegasusConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.shared = keras.layers.Embedding( + input_dim=config.vocab_size, + output_dim=config.d_model, + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), + name="model.shared", + ) + # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) + self.shared.load_weight_prefix = "model.shared" + + self.encoder = TFPegasusEncoder(config, self.shared, name="encoder") + self.decoder = TFPegasusDecoder(config, self.shared, name="decoder") + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + @unpack_inputs + def call( + self, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + decoder_input_ids: tf.Tensor | None = None, + decoder_attention_mask: tf.Tensor | None = None, + decoder_position_ids: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + decoder_head_mask: tf.Tensor | None = None, + cross_attn_head_mask: tf.Tensor | None = None, + encoder_outputs: tuple | TFBaseModelOutput | None = None, + past_key_values: tuple[tuple[tf.Tensor]] | None = None, + inputs_embeds: tf.Tensor | None = None, + decoder_inputs_embeds: tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + **kwargs, + ): + if decoder_input_ids is None and decoder_inputs_embeds is None: + use_cache = False + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a TFBaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput): + encoder_outputs = TFBaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + # If the user passed a TFBaseModelOutput for encoder_outputs, we wrap it in a tuple when return_dict=False + elif not return_dict and not isinstance(encoder_outputs, tuple): + encoder_outputs = encoder_outputs.to_tuple() + + decoder_outputs = self.decoder( + decoder_input_ids, + attention_mask=decoder_attention_mask, + position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return TFSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + # The shared/tied weights expect to be in the model base namespace + # Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than + # the current one. + with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"): + self.shared.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + + +@add_start_docstrings( + "The bare PEGASUS Model outputting raw hidden-states without any specific head on top.", + PEGASUS_START_DOCSTRING, +) +class TFPegasusModel(TFPegasusPreTrainedModel): + def __init__(self, config: PegasusConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.model = TFPegasusMainLayer(config, name="model") + + def get_encoder(self): + return self.model.encoder + + def get_decoder(self): + return self.model.decoder + + @unpack_inputs + @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSeq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_input_ids: np.ndarray | tf.Tensor | None = None, + decoder_attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + decoder_head_mask: np.ndarray | tf.Tensor | None = None, + cross_attn_head_mask: np.ndarray | tf.Tensor | None = None, + encoder_outputs: tuple | TFBaseModelOutput | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + **kwargs, + ) -> TFSeq2SeqModelOutput | tuple[tf.Tensor]: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + # Copied from transformers.models.bart.modeling_tf_bart.TFBartModel.serving_output + def serving_output(self, output): + pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None + dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None + cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None + enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + + return TFSeq2SeqModelOutput( + last_hidden_state=output.last_hidden_state, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + cross_attentions=cross_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + + +# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer +class BiasLayer(keras.layers.Layer): + """ + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, + so all weights have to be registered in a layer. + """ + + def __init__(self, shape, initializer, trainable, name, **kwargs): + super().__init__(name=name, **kwargs) + # Note: the name of this variable will NOT be scoped when serialized, i.e. it will not be in the format of + # "outer_layer/inner_layer/.../name:0". Instead, it will be "name:0". For further details, see: + # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214 + self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable) + + def call(self, x): + return x + self.bias + + +@add_start_docstrings( + "The PEGASUS Model with a language modeling head. Can be used for summarization.", + PEGASUS_START_DOCSTRING, +) +class TFPegasusForConditionalGeneration(TFPegasusPreTrainedModel, TFCausalLanguageModelingLoss): + _keys_to_ignore_on_load_unexpected = [ + r"model.encoder.embed_tokens.weight", + r"model.decoder.embed_tokens.weight", + ] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.model = TFPegasusMainLayer(config, name="model") + self.use_cache = config.use_cache + # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency. + self.bias_layer = BiasLayer( + name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False + ) + + def get_decoder(self): + return self.model.decoder + + def get_encoder(self): + return self.model.encoder + + def get_output_embeddings(self): + return self.get_input_embeddings() + + def set_output_embeddings(self, value): + self.set_input_embeddings(value) + + def get_bias(self): + return {"final_logits_bias": self.bias_layer.bias} + + def set_bias(self, value): + # Replaces the existing layers containing bias for correct (de)serialization. + vocab_size = value["final_logits_bias"].shape[-1] + self.bias_layer = BiasLayer( + name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False + ) + self.bias_layer.bias.assign(value["final_logits_bias"]) + + @unpack_inputs + @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(PEGASUS_GENERATION_EXAMPLE) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_input_ids: np.ndarray | tf.Tensor | None = None, + decoder_attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + decoder_head_mask: np.ndarray | tf.Tensor | None = None, + cross_attn_head_mask: np.ndarray | tf.Tensor | None = None, + encoder_outputs: TFBaseModelOutput | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool = False, + ) -> TFSeq2SeqLMOutput | tuple[tf.Tensor]: + """ + labels (`tf.tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + """ + + if labels is not None: + labels = tf.where( + labels == self.config.pad_token_id, + tf.cast(tf.fill(shape_list(labels), -100), labels.dtype), + labels, + ) + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True) + lm_logits = self.bias_layer(lm_logits) + masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + return TFSeq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, # index 1 of d outputs + decoder_hidden_states=outputs.decoder_hidden_states, # index 2 of d outputs + decoder_attentions=outputs.decoder_attentions, # index 3 of d outputs + cross_attentions=outputs.cross_attentions, # index 4 of d outputs + encoder_last_hidden_state=outputs.encoder_last_hidden_state, # index 0 of encoder outputs + encoder_hidden_states=outputs.encoder_hidden_states, # 1 of e out + encoder_attentions=outputs.encoder_attentions, # 2 of e out + ) + + # Copied from transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration.serving_output + def serving_output(self, output): + pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None + dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None + cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None + enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + + return TFSeq2SeqLMOutput( + logits=output.logits, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + cross_attentions=cross_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + ) + + # Copied from transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past_key_values=None, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ): + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + if decoder_attention_mask is not None: # xla + decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:] + elif past_key_values is not None: # no xla + past_key_values + decoder_position_ids = past_key_values[0][0].shape[2] + else: # no xla + no past_key_values + decoder_position_ids = tf.range(decoder_input_ids.shape[1]) + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "decoder_position_ids": decoder_position_ids, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) + + +__all__ = ["TFPegasusForConditionalGeneration", "TFPegasusModel", "TFPegasusPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/tokenization_pegasus.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/tokenization_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..b8a4a1c737d1592d9362f506e93b20854edfe7a5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/tokenization_pegasus.py @@ -0,0 +1,292 @@ +# coding=utf-8 +# Copyright 2020 Google and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from shutil import copyfile +from typing import Any, Optional + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} + + +logger = logging.get_logger(__name__) + + +# TODO ArthurZ refactor this to only use the added_tokens_encoder + + +@requires(backends=("sentencepiece",)) +class PegasusTokenizer(PreTrainedTokenizer): + r""" + Construct a PEGASUS tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking single token values. This is the token used when training this model with masked + language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining. + It corresponds to *[MASK2]* in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive + Summarization](https://huggingface.co/papers/1912.08777). + mask_token_sent (`str`, *optional*, defaults to `""`): + The token used for masking whole target sentences. This is the token used when training this model with gap + sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during + pretraining. It corresponds to *[MASK1]* in [PEGASUS: Pre-training with Extracted Gap-sentences for + Abstractive Summarization](https://huggingface.co/papers/1912.08777). + additional_special_tokens (`List[str]`, *optional*): + Additional special tokens used by the tokenizer. If no additional_special_tokens are provided and + are used as additional special tokens corresponding to the [original PEGASUS + tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66) + that uses the tokens 2 - 104 only for pretraining + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + pad_token="", + eos_token="", + unk_token="", + mask_token="", + mask_token_sent="", + additional_special_tokens=None, + offset=103, # entries 2 - 104 are only used for pretraining + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + self.offset = offset + if additional_special_tokens is not None: + if not isinstance(additional_special_tokens, list): + raise TypeError( + f"additional_special_tokens should be of type {type(list)}, but is" + f" {type(additional_special_tokens)}" + ) + additional_special_tokens_extended = ( + ([mask_token_sent] + additional_special_tokens) + if mask_token_sent not in additional_special_tokens and mask_token_sent is not None + else additional_special_tokens + ) + # fill additional tokens with ..., in case not all additional tokens are already taken + additional_special_tokens_extended += [ + f"" for i in range(len(additional_special_tokens_extended), self.offset - 1) + ] + + if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended): + raise ValueError( + "Please make sure that the provided additional_special_tokens do not contain an incorrectly" + f" shifted list of tokens. Found {additional_special_tokens_extended}." + ) + additional_special_tokens = additional_special_tokens_extended + else: + additional_special_tokens_extended = [] + additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else [] + additional_special_tokens += [f"" for i in range(2, self.offset)] + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.mask_token_sent = mask_token_sent + self.vocab_file = vocab_file + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + _added_tokens_decoder = { + 0: AddedToken(str(pad_token), special=True), + 1: AddedToken(str(eos_token), special=True), + } + + if self.mask_token_sent is not None: + _added_tokens_decoder[2] = AddedToken(mask_token_sent, special=True) + _added_tokens_decoder[3] = AddedToken(str(mask_token), special=True) + + for i in range(2, self.offset): + _added_tokens_decoder[len(_added_tokens_decoder)] = AddedToken(f"", special=True) + + # Force update as we want to make sure vocab is enforced (same as fast) + self._added_tokens_decoder = kwargs.pop("added_tokens_decoder", {}) + self._added_tokens_decoder.update(_added_tokens_decoder) + + super().__init__( + eos_token=eos_token, + unk_token=unk_token, + mask_token=mask_token, + pad_token=pad_token, + mask_token_sent=mask_token_sent, + offset=offset, + additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + @property + def vocab_size(self) -> int: + return len(self.sp_model) + self.offset + + def get_vocab(self) -> dict[str, int]: + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def _tokenize(self, text: str) -> list[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token: str) -> int: + """Converts a token (str) to an id using the vocab.""" + sp_id = self.sp_model.piece_to_id(token) + return sp_id + self.offset + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) to a token (str) using the vocab.""" + if index < self.offset: + return self.sp_model.IdToPiece(index) + token = self.sp_model.IdToPiece(index - self.offset) + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + out_string += self.sp_model.decode(current_sub_tokens) + token + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def num_special_tokens_to_add(self, pair=False): + """Just EOS""" + return 1 + + def _special_token_mask(self, seq): + all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp + all_special_ids.remove(self.unk_token_id) # is only sometimes special + + return [1 if x in all_special_ids else 0 for x in seq] + + def get_special_tokens_mask( + self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """Get list where entries are [1] if a token is [eos] or [pad] else 0.""" + if already_has_special_tokens: + return self._special_token_mask(token_ids_0) + elif token_ids_1 is None: + return self._special_token_mask(token_ids_0) + [1] + else: + return self._special_token_mask(token_ids_0 + token_ids_1) + [1] + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating + and adding special tokens. A PEGASUS sequence has the following format, where `X` represents the sequence: + + - single sequence: `X ` + - pair of sequences: `A B ` (not intended use) + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + [self.eos_token_id] + # We don't expect to process pairs, but leave the pair logic for API consistency + return token_ids_0 + token_ids_1 + [self.eos_token_id] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + +__all__ = ["PegasusTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/tokenization_pegasus_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/tokenization_pegasus_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..92a37c44ff2e302fe1e2a56849f2e91fa481ec9b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/tokenization_pegasus_fast.py @@ -0,0 +1,215 @@ +# coding=utf-8 +# Copyright 2020 Google and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for model PEGASUS.""" + +import os +from shutil import copyfile +from typing import Optional + +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_pegasus import PegasusTokenizer +else: + PegasusTokenizer = None + + +logger = logging.get_logger(__name__) + + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + + +class PegasusTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's *tokenizers* library). Based on + [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking single token values. This is the token used when training this model with masked + language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining. + It corresponds to *[MASK2]* in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive + Summarization](https://huggingface.co/papers/1912.08777). + mask_token_sent (`str`, *optional*, defaults to `""`): + The token used for masking whole target sentences. This is the token used when training this model with gap + sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during + pretraining. It corresponds to *[MASK1]* in [PEGASUS: Pre-training with Extracted Gap-sentences for + Abstractive Summarization](https://huggingface.co/papers/1912.08777). + additional_special_tokens (`List[str]`, *optional*): + Additional special tokens used by the tokenizer. If no additional_special_tokens are provided and + are used as additional special tokens corresponding to the [original PEGASUS + tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66) + that uses the tokens 2 - 104 only for pretraining + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = PegasusTokenizer + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + pad_token="", + eos_token="", + unk_token="", + mask_token="", + mask_token_sent="", + additional_special_tokens=None, + offset=103, # entries 2 - 104 are only used for pretraining + **kwargs, + ): + self.offset = offset + + if additional_special_tokens is not None: + if not isinstance(additional_special_tokens, list): + raise TypeError( + f"additional_special_tokens should be of type {type(list)}, but is" + f" {type(additional_special_tokens)}" + ) + + additional_special_tokens_extended = ( + ([mask_token_sent] + additional_special_tokens) + if mask_token_sent not in additional_special_tokens and mask_token_sent is not None + else additional_special_tokens + ) + # fill additional tokens with ..., in case not all additional tokens are already taken + additional_special_tokens_extended += [ + f"" for i in range(len(additional_special_tokens_extended), self.offset - 1) + ] + + if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended): + raise ValueError( + "Please make sure that the provided additional_special_tokens do not contain an incorrectly" + f" shifted list of tokens. Found {additional_special_tokens_extended}." + ) + additional_special_tokens = additional_special_tokens_extended + else: + additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else [] + additional_special_tokens += [f"" for i in range(2, self.offset)] + + # pegasus was design to support changing the index of the first tokens. If one of the padding/eos/unk/mask token + # is different from default, we must rebuild the vocab + from_slow = kwargs.pop("from_slow", None) + from_slow = from_slow or str(pad_token) != "" or str(eos_token) != "" or str(unk_token) != "" + + kwargs.pop("added_tokens_decoder", {}) + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + pad_token=pad_token, + eos_token=eos_token, + unk_token=unk_token, + mask_token=mask_token, + mask_token_sent=mask_token_sent, + offset=offset, + additional_special_tokens=additional_special_tokens, + from_slow=from_slow, + **kwargs, + ) + self.vocab_file = vocab_file + + def _special_token_mask(self, seq): + all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp + all_special_ids.remove(self.unk_token_id) # is only sometimes special + + if all_special_ids != set(range(len(self.additional_special_tokens) + 3)): + raise ValueError( + "There should be 3 special tokens: mask_token, pad_token, and eos_token +" + f" {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}" + ) + + return [1 if x in all_special_ids else 0 for x in seq] + + def get_special_tokens_mask( + self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """Get list where entries are [1] if a token is [eos] or [pad] else 0.""" + if already_has_special_tokens: + return self._special_token_mask(token_ids_0) + elif token_ids_1 is None: + return self._special_token_mask(token_ids_0) + [1] + else: + return self._special_token_mask(token_ids_0 + token_ids_1) + [1] + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]: + """ + Build model inputs from a sequence by adding eos to the end. no bos token is added to the front. + + - single sequence: `X ` + - pair of sequences: `A B ` (not intended use) + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + [self.eos_token_id] + # We don't expect to process pairs, but leave the pair logic for API consistency + return token_ids_0 + token_ids_1 + [self.eos_token_id] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["PegasusTokenizerFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..81c3ba93bcf470394a99309dbf2f9869cd68d887 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_perception_lm import * + from .image_processing_perception_lm_fast import * + from .modeling_perception_lm import * + from .processing_perception_lm import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e80bee88e51a38c7cb3fedd1ae176619f2f6a20 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/configuration_perception_lm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/configuration_perception_lm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7bc223af4874c3dbc78c19a72540faf51a61bf8a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/configuration_perception_lm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/image_processing_perception_lm_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/image_processing_perception_lm_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e421aa482e683b7fc6459504c5b4f3a478a576b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/image_processing_perception_lm_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/modeling_perception_lm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/modeling_perception_lm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99e8d01b648d207e6486c400ecdce3c65a99e05a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/modeling_perception_lm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/modular_perception_lm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/modular_perception_lm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..135fcea17e68a13d3cda7f6c6f56bfb510e52bbe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/modular_perception_lm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/processing_perception_lm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/processing_perception_lm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..703e4ea0ed6dc5522d0c651bd4222582e67449dd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/processing_perception_lm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/video_processing_perception_lm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/video_processing_perception_lm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5262a0bc7a3ea9f03b22d373ef92e7730502f319 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/video_processing_perception_lm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/configuration_perception_lm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/configuration_perception_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..08c084065ff86a519007afc9d6a1d97eacf6d381 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/configuration_perception_lm.py @@ -0,0 +1,88 @@ +# coding=utf-8 +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PerceptionLM model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig +from ..timm_wrapper.configuration_timm_wrapper import TimmWrapperConfig + + +logger = logging.get_logger(__name__) + + +class PerceptionLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PerceptionLMForConditionalGeneration`]. It is used to instantiate an + PerceptionLM model according to the specified arguments, defining the model architecture. + + Example models: + - [facebook/Perception-LM-1B](https://huggingface.co/facebook/Perception-LM-1B). + - [facebook/Perception-LM-3B](https://huggingface.co/facebook/Perception-LM-3B). + - [facebook/Perception-LM-8B](https://huggingface.co/facebook/Perception-LM-8B). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`Union[TimmWrapperConfig, dict]`, *optional*, defaults to `TimmWrapperConfig()`): + The config object or dictionary of the vision backbone. + text_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `LlamaConfig()`): + The config object or dictionary of the text backbone. + vision_use_cls_token (`bool`, *optional*, defaults to `True`): + Whether CLS token is used in the vision backbone. If used, we remove CLS token embedding from vision output. + projector_pooling_ratio (`int`, *optional*, defaults to 1): + The pooling ratio used in the multimodal projector. + image_token_id (`int`, *optional*, defaults to 128002): + The image token index to encode the image prompt. + video_token_id (`int`, *optional*, defaults to 128003): + The video token index to encode the video prompt. + """ + + model_type = "perception_lm" + sub_configs = {"text_config": AutoConfig, "vision_config": TimmWrapperConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + vision_use_cls_token=True, + projector_pooling_ratio=1, + image_token_id=128002, + video_token_id=128003, + **kwargs, + ): + self.image_token_id = image_token_id + self.video_token_id = video_token_id + if isinstance(vision_config, dict): + vision_config = TimmWrapperConfig(**vision_config) + elif isinstance(vision_config, TimmWrapperConfig): + pass + elif vision_config is None: + vision_config = TimmWrapperConfig() + self.vision_config = vision_config + self.vision_use_cls_token = vision_use_cls_token + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "llama") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["llama"]() + + self.text_config = text_config + self.projector_pooling_ratio = projector_pooling_ratio + super().__init__(**kwargs) + + +__all__ = ["PerceptionLMConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/image_processing_perception_lm_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..c26132a484397067ad7354ea50705e493f063ff1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -0,0 +1,309 @@ +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for PerceptionLM.""" + +import math +from functools import reduce +from typing import Optional, Union + +import numpy as np +import torch +from torchvision.transforms import functional as F + +from ...image_processing_utils import ( + BatchFeature, +) +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + get_image_size, + group_images_by_shape, + reorder_images, +) +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + PILImageResampling, +) +from ...processing_utils import Unpack +from ...utils import TensorType, auto_docstring + + +class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + r""" + vision_input_type (`str`, *optional*, defaults to `"thumb+tile"`): + Vision processing strategy. `"thumb+tile"` uses both thumbnails and multiple tiles for + multi-scale processing, otherwise uses single tile for lower memory usage. + tile_size (`int`, *optional*, defaults to `448`): + Height and width dimension (in pixels) of each tile used for image processing. + max_num_tiles (`int`, *optional*, defaults to `36`): + Maximum number of tiles an image can be split into based on its aspect ratio. + """ + + vision_input_type: str = "thumb+tile" + tile_size: int = 448 + max_num_tiles: int = 36 + + +@auto_docstring +class PerceptionLMImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = IMAGENET_STANDARD_MEAN + image_std = IMAGENET_STANDARD_STD + do_resize = True + do_center_crop = False + do_rescale = True + do_normalize = True + do_convert_rgb = True + size = {"width": 448, "height": 448} # for backward compatibility in tests + valid_kwargs = PerceptionLMFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> None: + super().__init__(**kwargs) + + @auto_docstring + def preprocess(self, images, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + + @staticmethod + def _factors(n: int): + """Return all factors of a number.""" + return set( + reduce( + list.__add__, + ([i, n // i] for i in range(1, int(n**0.5) + 1) if n % i == 0), + ) + ) + + def _find_supported_aspect_ratios(self): + """ + This function computes all the allowed aspect ratios for a fixed + number of input chunks. The order of returned items matters for the result of `_fit_image_to_canvas` function. + If tie exists in `_fit_image_to_canvas`, the latter in `_find_supported_aspect_ratios` wins. + + For example, with `num_tiles=5`, it will return: + { + 0.2: [(1, 5)], + 5.0: [(5, 1)], + 0.25: [(1, 4)], + 1.0: [(2, 2), (1, 1)], + 4.0: [(4, 1)], + 0.3333333333333333: [(1, 3)], + 3.0: [(3, 1)], + 0.5: [(1, 2)], + 2.0: [(2, 1)] + } + """ + asp_dict = {} + for chunk_size in range(self.max_num_tiles, 0, -1): + _factors = sorted(self._factors(chunk_size)) + _asp_ratios = [(x, chunk_size // x) for x in _factors] + for ratio in _asp_ratios: + k = ratio[0] / ratio[1] + if k not in asp_dict: + asp_dict[k] = [ratio] + else: + asp_dict[k].append(ratio) + return asp_dict + + def _get_image_height_width( + self, image_width: int, image_height: int, target_width: int, target_height: int + ) -> tuple[int, int]: + """ + Given image width, height and target width, height for the canvas, return the dimensions of how the image would be resized + with aspect ratio preservation. + """ + scale = image_width / image_height + + if scale > 1.0: + # Width is larger than height + + # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas. + rescaling_factor = min(target_width / image_width, target_height / image_height) + + # Set new width to target width and height to the rescaled height. + new_w = rescaling_factor * image_width + new_h = math.floor(new_w / scale) + + else: + # Height is larger than width + + # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas. + rescaling_factor = min(target_width / image_width, target_height / image_height) + + # Set new height to target height and width to the rescaled width. + new_h = rescaling_factor * image_height + new_w = math.floor(new_h * scale) + + return new_w, new_h + + def _fit_image_to_canvas(self, img_width: int, img_height: int, tile_size: int): + """ + Given an image width, height and target number of chunks this function will see if the image + can be fit into any of the canvases that can be build from arranging the tiles in a grid. + If the image can be fit onto several canvases, it will return the canvas where the shorter edge + of the image will be largest. + """ + # Initialize the optimal canvas to None. If no canvas is found where image fits, function returns None. + optimal_canvas = None + optimal_image_width_height = None + + scale = img_width / img_height + + # Gather all potential supported image resolutions and iterate through them to find best match + potential_arrangements = [ + item for sublist in self._find_supported_aspect_ratios().values() for item in sublist + ] + for n_w, n_h in potential_arrangements: + # Compute the canvas size + canvas_width, canvas_height = n_w * tile_size, n_h * tile_size + + # Check if image can fit into the canvas without downsampling + if canvas_width >= img_width and canvas_height >= img_height: + # If we did not find a good canvas yet, we will use the current one + if optimal_canvas is None: + # Set optimal canvas and determine the actual image height and width in the canvas with aspect ratio preserving resampling + optimal_canvas = (n_w, n_h) + optimal_image_width_height = self._get_image_height_width( + image_width=img_width, + image_height=img_height, + target_width=n_w * tile_size, + target_height=n_h * tile_size, + ) + else: + # If we already found an optimal canvas before, we will check if the shorter edge of the image will be larger than the current optimal canvas. + # This means we can potentially upsample the image resolution which is beneficial to performance. + image_width_height = self._get_image_height_width( + image_width=img_width, + image_height=img_height, + target_width=n_w * tile_size, + target_height=n_h * tile_size, + ) + # Llama3V dynamic tiling. Prioritize biggest canvas. + if (scale < 1.0 and (image_width_height[0] >= optimal_image_width_height[0])) or ( + scale >= 1.0 and (image_width_height[1] >= optimal_image_width_height[1]) + ): + optimal_canvas = (n_w, n_h) + optimal_image_width_height = image_width_height + return optimal_canvas + + def _find_closest_aspect_ratio(self, img_width: int, img_height: int, tile_size: int) -> tuple: + """ + Given an image width, height and target number of chunks + this function will find the closest supported aspect ratio. + """ + target_aspect_ratio = img_width / img_height + asp_dict = self._find_supported_aspect_ratios() + closest_aspect_ratio = None + if target_aspect_ratio >= 1: + closest_aspect_ratio = min( + [k for k in asp_dict if k <= target_aspect_ratio], + key=lambda x: abs(x - target_aspect_ratio), + ) + tiles_given_aspect_ratio = asp_dict[closest_aspect_ratio] + # select largest width + return max(tiles_given_aspect_ratio, key=lambda x: x[0]) + else: + closest_aspect_ratio = min( + [k for k in asp_dict if k > target_aspect_ratio], + key=lambda x: abs(1 / x - 1 / target_aspect_ratio), + ) + tiles_given_aspect_ratio = asp_dict[closest_aspect_ratio] + # select largest height + return max(tiles_given_aspect_ratio, key=lambda x: x[1]) + + def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor: + # Split image into number of required tiles (width x height) + batch_size, num_channels, height, width = image.size() + image = image.view(batch_size, num_channels, nch, height // nch, ncw, width // ncw) + # Permute dimensions to reorder the axes + image = image.permute(0, 2, 4, 1, 3, 5).contiguous() + # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2) + image = image.view(batch_size, ncw * nch, num_channels, height // nch, width // ncw) + return image + + def resize( + self, + image: np.ndarray, + tile_size: int, + max_num_tiles: int, + resample: PILImageResampling = PILImageResampling.BICUBIC, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + height, width = get_image_size(image, channel_dim=input_data_format) + if max_num_tiles > 1: + aspect_ratio = self._fit_image_to_canvas(img_width=width, img_height=height, tile_size=tile_size) + if aspect_ratio is None: + # If we did not find a canvas, we have to find the closest aspect ratio and downsample the image + aspect_ratio = self._find_closest_aspect_ratio(img_width=width, img_height=height, tile_size=tile_size) + else: + aspect_ratio = (1, 1) + new_width, new_height = aspect_ratio[0] * tile_size, aspect_ratio[1] * tile_size + image = F.resize(image, (new_height, new_width), interpolation=resample) + return image, aspect_ratio + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + do_rescale: Optional[bool], + rescale_factor: Optional[Union[int, float]], + do_normalize: Optional[bool], + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + vision_input_type: str, + tile_size: int, + max_num_tiles: int, + return_tensors: Optional[Union[str, TensorType]], + disable_grouping: bool, + **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs], + ) -> BatchFeature: + # Group images by size for batched transformation + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + if vision_input_type == "thumb+tile": + thumbnails, _ = self.resize(stacked_images, tile_size, max_num_tiles=1) + images_for_tiling, (tiles_w, tiles_h) = self.resize( + stacked_images, tile_size, max_num_tiles=max_num_tiles + ) + image_tiles = self._split(images_for_tiling, tiles_w, tiles_h) + stacked_images = torch.cat([thumbnails.unsqueeze(1), image_tiles], dim=1) + else: # vanilla single tile for low memory devices + stacked_images, _ = self.resize(stacked_images, tile_size, max_num_tiles=1) + + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, + do_rescale, + rescale_factor, + do_normalize, + image_mean, + image_std, + ) + processed_images_grouped[shape] = stacked_images + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = [p[None] if p.ndim == 3 else p for p in processed_images] # add tiles dimension if needed + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + + +__all__ = ["PerceptionLMImageProcessorFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/modeling_perception_lm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/modeling_perception_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..074e91e14e88d29296e40f8a3f5c94663221997d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/modeling_perception_lm.py @@ -0,0 +1,487 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/perception_lm/modular_perception_lm.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_perception_lm.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, can_return_tuple +from ..auto import AutoModel +from .configuration_perception_lm import PerceptionLMConfig + + +class PerceptionLMAdaptiveAvgPooling(nn.Module): + def __init__(self, pooling_ratio=2): + super().__init__() + self.pooling_ratio = pooling_ratio + + def forward(self, hidden_states): + b, num_tokens, c = hidden_states.shape + h = int(math.sqrt(num_tokens)) + if h * h != num_tokens: + raise ValueError(f"num_tokens {num_tokens} is expected to be a square number") + + shape = (h // self.pooling_ratio, h // self.pooling_ratio) + hidden_states = hidden_states.permute(0, 2, 1).reshape(b, -1, h, h) + hidden_states = F.adaptive_avg_pool2d(hidden_states, shape) + hidden_states = hidden_states.flatten(2).transpose(1, 2) + + return hidden_states + + +class PerceptionLMMultiModalProjector(nn.Module): + def __init__(self, config: PerceptionLMConfig): + super().__init__() + input_size = config.vision_config.model_args["embed_dim"] + output_size = config.text_config.hidden_size + self.linear_1 = nn.Linear( + in_features=input_size, + out_features=output_size, + bias=True, + ) + self.gelu = nn.GELU() + self.linear_2 = nn.Linear( + in_features=output_size, + out_features=output_size, + bias=True, + ) + self.pooling = ( + PerceptionLMAdaptiveAvgPooling(config.projector_pooling_ratio) + if config.projector_pooling_ratio > 1 + else nn.Identity() + ) + + def forward(self, features): + features = features.permute(1, 0, 2) # NLD -> LND + features = self.linear_1(features) + features = self.gelu(features) + features = self.linear_2(features) + features = features.permute(1, 0, 2) # LND -> NLD + features = self.pooling(features) + return features + + +@auto_docstring +class PerceptionLMPreTrainedModel(PreTrainedModel): + config: PerceptionLMConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + + _supports_flash_attn = True + _supports_sdpa = True + + _can_compile_fullgraph = True + _supports_flex_attn = True + _supports_attention_backend = True + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for PerceptionLM outputs, with hidden states and attentions. + """ +) +class PerceptionLMModelOutputWithPast(BaseModelOutputWithPast): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + Image hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + video_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_videos, sequence_length, hidden_size)`. + Video hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + video_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for PerceptionLM causal language model (or autoregressive) outputs. + """ +) +class PerceptionLMCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + Image hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + video_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_videos, sequence_length, hidden_size)`. + Video hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + video_hidden_states: Optional[torch.FloatTensor] = None + + +@auto_docstring +class PerceptionLMModel(PerceptionLMPreTrainedModel): + _checkpoint_conversion_mapping = {} + + def __init__(self, config: PerceptionLMConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) + self.multi_modal_projector = PerceptionLMMultiModalProjector(config) + self.language_model = AutoModel.from_config(config.text_config) + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + **kwargs, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_tiles, channels, height, width)`) + The tensors corresponding to the input images. + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_tiles, num_patches, embed_dim)`). + """ + image_outputs = self.vision_tower(pixel_values.flatten(0, 1)) + image_outputs = image_outputs.last_hidden_state + if self.config.vision_use_cls_token: + image_outputs = image_outputs[:, 1:, :] + image_features = self.multi_modal_projector(image_outputs) + return image_features + + def get_placeholder_mask( + self, + input_ids: torch.LongTensor, + inputs_embeds: torch.FloatTensor, + image_features: Optional[torch.FloatTensor] = None, + video_features: Optional[torch.FloatTensor] = None, + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + special_video_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_video_mask = special_video_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + special_video_mask = input_ids == self.config.video_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.size()[:-1].numel()}" + ) + + n_video_tokens = special_video_mask.sum() + special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel(): + raise ValueError( + f"Videos features and image tokens do not match: tokens: {n_video_tokens}, features {video_features.size()[:-1].numel()}" + ) + + return special_image_mask, special_video_mask + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **lm_kwargs, + ) -> Union[tuple, PerceptionLMModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: + raise ValueError( + "You cannot specify both (pixel_values or pixel_values_videos) and inputs_embeds at the same time, and must specify either one" + ) + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + image_features = None + if pixel_values is not None: + image_features = self.get_image_features(pixel_values=pixel_values) + image_features = image_features.to(inputs_embeds.device, dtype=inputs_embeds.dtype) + special_image_mask, _ = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + video_features = None + if pixel_values_videos is not None: + video_features = self.get_image_features(pixel_values=pixel_values_videos) + video_features = video_features.to(inputs_embeds.device, dtype=inputs_embeds.dtype) + _, special_video_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, video_features=video_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **lm_kwargs, + ) + return PerceptionLMModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + past_key_values=outputs.past_key_values, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + video_hidden_states=(video_features if pixel_values_videos is not None else None), + ) + + +@auto_docstring +class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = {} + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: PerceptionLMConfig): + super().__init__(config) + self.model = PerceptionLMModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **lm_kwargs, + ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + from transformers import AutoProcessor, AutoModelForImageTextToText + from huggingface_hub import hf_hub_download + + MODEL_PATH = "facebook/Perception-LM-1B" + processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True) + model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH).to("cuda") + test_image_file = hf_hub_download( + repo_id="shumingh/perception_lm_test_images", + filename="14496_0.PNG", + repo_type="dataset", + ) + conversation = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": test_image_file, + }, + {"type": "text", "text": "Describe the bar plot in the image."}, + ], + } + ] + + inputs = processor.apply_chat_template( + [conversation], + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + inputs = inputs.to(model.device) + generate_ids = model.generate(**inputs, max_new_tokens=256) + input_length = inputs["input_ids"].shape[1] + generate_ids_without_inputs = generate_ids[:, input_length:] + + for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True): + print(output) + ```""" + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **lm_kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + vocab_size=self.config.text_config.vocab_size, + **lm_kwargs, + ) + + return PerceptionLMCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + video_hidden_states=outputs.video_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + pixel_values_videos=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["pixel_values_videos"] = pixel_values_videos + return model_inputs + + +__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel", "PerceptionLMModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/modular_perception_lm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/modular_perception_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..2b50b824220210bebb65f5c27b206b8d8e101f84 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/modular_perception_lm.py @@ -0,0 +1,441 @@ +# coding=utf-8 +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch PerceptionLM model.""" + +import math +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...cache_utils import Cache +from ...utils import ( + auto_docstring, + can_return_tuple, + logging, +) +from ..auto import AutoModel +from ..llava.modeling_llava import ( + LlavaCausalLMOutputWithPast, + LlavaForConditionalGeneration, + LlavaModel, + LlavaModelOutputWithPast, + LlavaPreTrainedModel, +) +from .configuration_perception_lm import PerceptionLMConfig + + +logger = logging.get_logger(__name__) + + +class PerceptionLMAdaptiveAvgPooling(nn.Module): + def __init__(self, pooling_ratio=2): + super().__init__() + self.pooling_ratio = pooling_ratio + + def forward(self, hidden_states): + b, num_tokens, c = hidden_states.shape + h = int(math.sqrt(num_tokens)) + if h * h != num_tokens: + raise ValueError(f"num_tokens {num_tokens} is expected to be a square number") + + shape = (h // self.pooling_ratio, h // self.pooling_ratio) + hidden_states = hidden_states.permute(0, 2, 1).reshape(b, -1, h, h) + hidden_states = F.adaptive_avg_pool2d(hidden_states, shape) + hidden_states = hidden_states.flatten(2).transpose(1, 2) + + return hidden_states + + +class PerceptionLMMultiModalProjector(nn.Module): + def __init__(self, config: PerceptionLMConfig): + super().__init__() + input_size = config.vision_config.model_args["embed_dim"] + output_size = config.text_config.hidden_size + self.linear_1 = nn.Linear( + in_features=input_size, + out_features=output_size, + bias=True, + ) + self.gelu = nn.GELU() + self.linear_2 = nn.Linear( + in_features=output_size, + out_features=output_size, + bias=True, + ) + self.pooling = ( + PerceptionLMAdaptiveAvgPooling(config.projector_pooling_ratio) + if config.projector_pooling_ratio > 1 + else nn.Identity() + ) + + def forward(self, features): + features = features.permute(1, 0, 2) # NLD -> LND + features = self.linear_1(features) + features = self.gelu(features) + features = self.linear_2(features) + features = features.permute(1, 0, 2) # LND -> NLD + features = self.pooling(features) + return features + + +class PerceptionLMPreTrainedModel(LlavaPreTrainedModel): + base_model_prefix = "model" + + +class PerceptionLMModelOutputWithPast(LlavaModelOutputWithPast): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + Image hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + video_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_videos, sequence_length, hidden_size)`. + Video hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + video_hidden_states: Optional[torch.FloatTensor] = None + + +class PerceptionLMCausalLMOutputWithPast(LlavaCausalLMOutputWithPast): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + Image hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + video_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_videos, sequence_length, hidden_size)`. + Video hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + video_hidden_states: Optional[torch.FloatTensor] = None + + +@auto_docstring +class PerceptionLMModel(LlavaModel): + _checkpoint_conversion_mapping = {} + + def __init__(self, config: PerceptionLMConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) + self.multi_modal_projector = PerceptionLMMultiModalProjector(config) + self.language_model = AutoModel.from_config(config.text_config) + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + **kwargs, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_tiles, channels, height, width)`) + The tensors corresponding to the input images. + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_tiles, num_patches, embed_dim)`). + """ + image_outputs = self.vision_tower(pixel_values.flatten(0, 1)) + image_outputs = image_outputs.last_hidden_state + if self.config.vision_use_cls_token: + image_outputs = image_outputs[:, 1:, :] + image_features = self.multi_modal_projector(image_outputs) + return image_features + + def get_placeholder_mask( + self, + input_ids: torch.LongTensor, + inputs_embeds: torch.FloatTensor, + image_features: Optional[torch.FloatTensor] = None, + video_features: Optional[torch.FloatTensor] = None, + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + special_video_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_video_mask = special_video_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + special_video_mask = input_ids == self.config.video_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.size()[:-1].numel()}" + ) + + n_video_tokens = special_video_mask.sum() + special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel(): + raise ValueError( + f"Videos features and image tokens do not match: tokens: {n_video_tokens}, features {video_features.size()[:-1].numel()}" + ) + + return special_image_mask, special_video_mask + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **lm_kwargs, + ) -> Union[tuple, PerceptionLMModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: + raise ValueError( + "You cannot specify both (pixel_values or pixel_values_videos) and inputs_embeds at the same time, and must specify either one" + ) + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + image_features = None + if pixel_values is not None: + image_features = self.get_image_features(pixel_values=pixel_values) + image_features = image_features.to(inputs_embeds.device, dtype=inputs_embeds.dtype) + special_image_mask, _ = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + video_features = None + if pixel_values_videos is not None: + video_features = self.get_image_features(pixel_values=pixel_values_videos) + video_features = video_features.to(inputs_embeds.device, dtype=inputs_embeds.dtype) + _, special_video_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, video_features=video_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **lm_kwargs, + ) + return PerceptionLMModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + past_key_values=outputs.past_key_values, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + video_hidden_states=(video_features if pixel_values_videos is not None else None), + ) + + +@auto_docstring +class PerceptionLMForConditionalGeneration(LlavaForConditionalGeneration): + _checkpoint_conversion_mapping = {} + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + pixel_values_videos=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["pixel_values_videos"] = pixel_values_videos + return model_inputs + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **lm_kwargs, + ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + from transformers import AutoProcessor, AutoModelForImageTextToText + from huggingface_hub import hf_hub_download + + MODEL_PATH = "facebook/Perception-LM-1B" + processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True) + model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH).to("cuda") + test_image_file = hf_hub_download( + repo_id="shumingh/perception_lm_test_images", + filename="14496_0.PNG", + repo_type="dataset", + ) + conversation = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": test_image_file, + }, + {"type": "text", "text": "Describe the bar plot in the image."}, + ], + } + ] + + inputs = processor.apply_chat_template( + [conversation], + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + inputs = inputs.to(model.device) + generate_ids = model.generate(**inputs, max_new_tokens=256) + input_length = inputs["input_ids"].shape[1] + generate_ids_without_inputs = generate_ids[:, input_length:] + + for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True): + print(output) + ```""" + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **lm_kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + vocab_size=self.config.text_config.vocab_size, + **lm_kwargs, + ) + + return PerceptionLMCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + video_hidden_states=outputs.video_hidden_states, + ) + + def get_image_features(self, **kwargs): + raise AttributeError("Not needed for PerceptionLM") + + def language_model(self): + raise AttributeError("Not needed for PerceptionLM") + + def vision_tower(self): + raise AttributeError("Not needed for PerceptionLM") + + def multi_modal_projector(self): + raise AttributeError("Not needed for PerceptionLM") + + +__all__ = [ + "PerceptionLMForConditionalGeneration", + "PerceptionLMPreTrainedModel", + "PerceptionLMModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/processing_perception_lm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/processing_perception_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..f61c54554d32c22b4f22ae8905233405c828f3da --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/processing_perception_lm.py @@ -0,0 +1,244 @@ +# coding=utf-8 +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for PerceptionLM. +""" + +from collections.abc import Iterable +from typing import Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput, get_image_size, to_numpy_array +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging +from ...video_utils import VideoInput + + +logger = logging.get_logger(__name__) + + +class PerceptionLMProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + "return_mm_token_type_ids": False, + }, + } + + +class PerceptionLMProcessor(ProcessorMixin): + r""" + Constructs a PerceptionLM processor which wraps a PerceptionLM image processor, a PerceptionLM video processor, and a tokenizer into a single processor. + + [`PerceptionLMProcessor`] offers all the functionalities of [`PerceptionLMImageProcessorFast`], [`PerceptionLMVideoProcessor`], and the tokenizer (e.g. [`LlamaTokenizerFast`]). See the + [`~PerceptionLMProcessor.__call__`] and [`~PerceptionLMProcessor.decode`] for more information. + + Args: + video_processor ([`PerceptionLMVideoProcessor`], *optional*): + The video processor to process video inputs. + image_processor ([`PerceptionLMImageProcessorFast`], *optional*): + The image processor to process image inputs. + tokenizer ([`LlamaTokenizerFast`] or similar, *optional*): + The tokenizer to process text inputs. + patch_size (`int`, *optional*): + Patch size from the vision tower. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. + pooling_ratio (`int`, *optional*, defaults to 2): + Pooling ratio for vision tokens. If not 1, 2D adaptive pooling is applied over projected vision tokens. + """ + + attributes = ["video_processor", "image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + video_processor_class = "AutoVideoProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + video_processor=None, + image_processor=None, + tokenizer=None, + patch_size=None, + chat_template=None, + pooling_ratio=2, + **kwargs, + ): + self.patch_size = patch_size + self.pooling_ratio = pooling_ratio + self.image_token = tokenizer.image_token + self.video_token = tokenizer.video_token + self.image_token_id = tokenizer.image_token_id + self.video_token_id = tokenizer.video_token_id + super().__init__(video_processor, image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + audio=None, + videos: Optional[VideoInput] = None, + **kwargs: Unpack[PerceptionLMProcessorKwargs], + ) -> BatchFeature: + """ + Prepares a batch containing one or more sequences of text and/or images and/or videos. + + If `text` is provided, it is tokenized using the tokenizer. + If `images` is provided, they are processed using the image processor. + If `videos` is provided, they are processed using the video processor. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*): + The image or batch of images to be processed. Each image can be a PIL image, NumPy array, or PyTorch tensor. + Both channels-first and channels-last formats are supported. + text (`str`, `List[str]`, *optional*): + The sequence or batch of sequences to be tokenized. Each sequence can be a string. + videos (`Any`, *optional*): + The video or batch of videos to be processed. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is provided. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is provided). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is provided. + - **pixel_values_videos** -- Video pixel values to be fed to a model. Returned when `videos` is provided. + """ + if text is None: + raise ValueError( + "You have to specify at least `text` input. Optionally, you can also specify `images` or `videos`." + ) + + output_kwargs = self._merge_kwargs( + PerceptionLMProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + else: + image_inputs = {} + + if videos is not None: + videos_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"]) + else: + videos_inputs = {} + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + # try to expand inputs in processing if we have the necessary parts + prompt_strings = [] + + pixel_values = iter(image_inputs.get("pixel_values", [])) + pixel_values_videos = iter(videos_inputs.get("pixel_values_videos", [])) + for sample in text: + # Replace the media token with the expanded media token sequence + sample = self._expand_media_tokens(sample, self.tokenizer.image_token, pixel_values) + sample = self._expand_media_tokens(sample, self.tokenizer.video_token, pixel_values_videos) + prompt_strings.append(sample) + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image", "video"]) + + if return_mm_token_type_ids: + array_ids = np.array(text_inputs["input_ids"]) + mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) + mm_token_type_ids[array_ids == self.image_token_id] = 1 + text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() + + return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors) + + def _expand_media_tokens(self, sample, media_token: str, media_iter: Iterable): + media_count = sample.count(media_token) + if media_count > 0: + media_list = [next(media_iter) for _ in range(media_count)] + sample_splits = sample.split(media_token) + media_token_list = [] + for media in media_list: + height, width = get_image_size(to_numpy_array(media)) + num_tiles = media.shape[0] + num_media_tokens = ( + (height // self.patch_size // self.pooling_ratio) + * (width // self.patch_size // self.pooling_ratio) + * num_tiles + ) + media_token_list.append(num_media_tokens) + sample = "" + for i, num_media_tokens in enumerate(media_token_list): + sample += sample_splits[i] + sample += media_token * num_media_tokens + sample += sample_splits[-1] + return sample + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + + vision_data = {} + if image_sizes is not None: + images_kwargs = PerceptionLMProcessorKwargs._defaults.get("images_kwargs", {}) + images_kwargs.update(kwargs) + tile_size = images_kwargs.get("tile_size", None) or self.image_processor.tile_size + vision_input_type = images_kwargs.get("vision_input_type", None) or self.image_processor.vision_input_type + + num_image_tokens = [] + num_image_patches = [] + for height, width in image_sizes: + if vision_input_type == "thumb+tile": + aspect_ratio = self.image_processor._fit_image_to_canvas( + img_width=width, img_height=height, tile_size=tile_size + ) + if aspect_ratio is None: + aspect_ratio = self.image_processor._find_closest_aspect_ratio( + img_width=width, img_height=height, tile_size=tile_size + ) + num_tiles = aspect_ratio[0] * aspect_ratio[1] + 1 # base image and tiles + else: + num_tiles = 1 + + num_image_tokens.append( + (tile_size // self.patch_size // self.pooling_ratio) + * (tile_size // self.patch_size // self.pooling_ratio) + * num_tiles + ) + num_image_patches.append(num_tiles) + + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + return MultiModalData(**vision_data) + + +__all__ = ["PerceptionLMProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/video_processing_perception_lm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/video_processing_perception_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..1023aa7c589dab5b249c6dff3d61e4d6e523eda8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/video_processing_perception_lm.py @@ -0,0 +1,41 @@ +# coding=utf-8 +# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Video processor class for PerceptionLM.""" + +from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling +from ...processing_utils import Unpack, VideosKwargs +from ...video_processing_utils import BaseVideoProcessor + + +class PerceptionLMFastVideoProcessorInitKwargs(VideosKwargs): ... + + +class PerceptionLMVideoProcessor(BaseVideoProcessor): + resample = PILImageResampling.BICUBIC + image_mean = IMAGENET_STANDARD_MEAN + image_std = IMAGENET_STANDARD_STD + size = {"height": 448, "width": 448} + do_resize = True + do_center_crop = False + do_rescale = True + do_normalize = True + do_convert_rgb = True + valid_kwargs = PerceptionLMFastVideoProcessorInitKwargs + model_input_names = ["pixel_values_videos"] + + def __init__(self, **kwargs: Unpack[PerceptionLMFastVideoProcessorInitKwargs]): + super().__init__(**kwargs) + + +__all__ = ["PerceptionLMVideoProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3de7bff9396f3676b4f750cc9b14a5a9d5af039d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/configuration_persimmon.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/configuration_persimmon.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5dcdbfee5f9118482f6a0981abb4277dd7cfb2a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/configuration_persimmon.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/modeling_persimmon.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/modeling_persimmon.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..994952322388b60ce4e78a7bc1881827bf6ac3a2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/modeling_persimmon.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35779de1410250d586f01b35826ce2d7f8d6850d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/configuration_phi.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/configuration_phi.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16869b0b233c07c42e94d8e2825d095fc4a35fd1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/configuration_phi.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/modeling_phi.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/modeling_phi.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d9b62db832f30517530cd9945494e1ecb05e014 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/modeling_phi.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/modular_phi.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/modular_phi.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4280a842d28ae6544c2cb047ba9f9813e390c31 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/modular_phi.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ffc31db387bfce33d61444c70bc711ba18e4e7b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/configuration_phi4_multimodal.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/configuration_phi4_multimodal.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6762407a6389a521407275b1f31ac6f50c39ffc3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/configuration_phi4_multimodal.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/feature_extraction_phi4_multimodal.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/feature_extraction_phi4_multimodal.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e3a0b47843c5317030f00597c5154c7d1c34dd9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/feature_extraction_phi4_multimodal.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/image_processing_phi4_multimodal_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/image_processing_phi4_multimodal_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..669ad1f83d3bd13674775a7cf665d6d28080a059 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/image_processing_phi4_multimodal_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/modular_phi4_multimodal.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/modular_phi4_multimodal.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f600cd1a810eb9f06962c83be4cbda823fcc6e64 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/modular_phi4_multimodal.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/processing_phi4_multimodal.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/processing_phi4_multimodal.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bae7e9c1b21f4aa96c983ab8cdd77372ed91f92 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/processing_phi4_multimodal.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phobert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phobert/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..218eaf015bcef9db6be649129a065218f5506db1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phobert/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phobert/__pycache__/tokenization_phobert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phobert/__pycache__/tokenization_phobert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ebcc8bcd1139c21e41b93df6e697839a995e93d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phobert/__pycache__/tokenization_phobert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9603e1080f397e067683ea9c9ee42fe8d14d3f6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/configuration_pixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/configuration_pixtral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9c863307d823e100104587bb1b0566e855d96c8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/configuration_pixtral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/image_processing_pixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/image_processing_pixtral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc1daaa741c3ee32e9bcd09caf6b1015fcacdb01 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/image_processing_pixtral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/image_processing_pixtral_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/image_processing_pixtral_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..036b948a70a8e40329a03a506e2f84d500b634da Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/image_processing_pixtral_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/modeling_pixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/modeling_pixtral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17d04f1cb06895bb149564d56e9907571cfa56f3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/modeling_pixtral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/processing_pixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/processing_pixtral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..419e3e153cb4454c0161f443015d4a37b1bd02f0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/processing_pixtral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea5914c13d396d7a49eafd15358e8a379dad079c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/configuration_plbart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/configuration_plbart.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4041b64275c71241f1a77c6c6f69af2818d7ef74 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/configuration_plbart.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/modeling_plbart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/modeling_plbart.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3cc5c70274ebf5e154cdec74ce28caa5dd30ce5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/modeling_plbart.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/modular_plbart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/modular_plbart.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f897c2dfcdca4d00cdce2262589d8d72d6c1add5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/modular_plbart.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/tokenization_plbart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/tokenization_plbart.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5d45053d0d270d3747906b415ba5f40be29ddd9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/tokenization_plbart.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0287444a0b0511b4f3155c47ca2a7c4444f1a3d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/configuration_pvt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/configuration_pvt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88718d88c37fd5926d3bc337a4a5bf196f287aa3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/configuration_pvt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/image_processing_pvt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/image_processing_pvt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..572a30e376d2b792c385b448cd9abf446789c26a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/image_processing_pvt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/image_processing_pvt_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/image_processing_pvt_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..230fba38a0be29110a166b3ee8e22489ded1d67b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/image_processing_pvt_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/modeling_pvt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/modeling_pvt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2968be8fd6262e0a68a28a403124c30b6e5730a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/modeling_pvt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/configuration_pvt_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/configuration_pvt_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c79541e8f178c120225b56727757e30be575d9db Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/configuration_pvt_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a46fde7375f4705708d890524357981a661ecba2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/configuration_qwen2_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/configuration_qwen2_audio.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11e25c79e30334e8e668f12373d30d4a172fc2f7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/configuration_qwen2_audio.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/modeling_qwen2_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/modeling_qwen2_audio.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a7a3f2f4dcb370238d122b0dc9040061a306933 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/modeling_qwen2_audio.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/processing_qwen2_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/processing_qwen2_audio.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30f22e32b420be13264d18225e6dbd56e89acbdc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/processing_qwen2_audio.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c206b7418327caa614e4d2a792463c4e7f65ec2d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/configuration_qwen3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/configuration_qwen3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0cbd02d40d3805f10bde801ff4bc96206bf4c02f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/configuration_qwen3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/modeling_qwen3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/modeling_qwen3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..287b1ffc9ce0223fac3969b16943356f7e744296 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/modeling_qwen3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/modular_qwen3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/modular_qwen3.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fac8e688842827a70932e9715c30c935fe31cdfd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/modular_qwen3.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ddc1c48f33740c1b197c183d64ae8861ff77391a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/configuration_qwen3_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/configuration_qwen3_next.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7a71e423195a5a3436aec73ac163e175bc15ff3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/configuration_qwen3_next.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/modeling_qwen3_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/modeling_qwen3_next.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af30c5b5b2f7cf704af18129bb3ed8b10d8ec461 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/modeling_qwen3_next.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/modular_qwen3_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/modular_qwen3_next.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0cf4c1842f2f3bc36d147250a1974e2591a1cfe2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/modular_qwen3_next.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..510063a436a8f06a623d01abcbb018fae38528d0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/processing_qwen3_vl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/processing_qwen3_vl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43d9c902c108f8f14de191e9b61580321bdfb291 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/processing_qwen3_vl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..edb67a2531d586fbbcb6cda77b8c861a286625e9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/configuration_qwen3_vl_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/configuration_qwen3_vl_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8607078a78fbd8ecf6667028d96329c123e1f6cc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/configuration_qwen3_vl_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/modeling_qwen3_vl_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/modeling_qwen3_vl_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22fe30ce0af56667a7829b8f99089266dc8f3f7d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/modeling_qwen3_vl_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/modular_qwen3_vl_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/modular_qwen3_vl_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..71c34219d52ce900fc72a29097d5dbf18d7a2b0b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/modular_qwen3_vl_moe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ee1a45c035034c34db8b997d55553d5c46c8e86 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/configuration_roberta_prelayernorm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/configuration_roberta_prelayernorm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81fcdffeee7b037200da156d7d6b851df7e0c930 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/configuration_roberta_prelayernorm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_flax_roberta_prelayernorm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_flax_roberta_prelayernorm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ccbed1c48d3b997e16121d15b0ced0b663b71a5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_flax_roberta_prelayernorm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_roberta_prelayernorm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_roberta_prelayernorm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4ff9c1f7624a02b888acafa67dc34ae3189bb85 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_roberta_prelayernorm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_tf_roberta_prelayernorm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_tf_roberta_prelayernorm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b9751e47ec4df58d1d32a29b16b0f0afbb5fbb1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_tf_roberta_prelayernorm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cd1fac5b679dc5ca527dcc0a6a45b9a873daa3dc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__init__.py @@ -0,0 +1,33 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_rt_detr import * + from .configuration_rt_detr_resnet import * + from .image_processing_rt_detr import * + from .image_processing_rt_detr_fast import * + from .modeling_rt_detr import * + from .modeling_rt_detr_resnet import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/configuration_rt_detr_resnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/configuration_rt_detr_resnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..261ffab3e8a20239b7444e4dd460d8d19e857cc5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/configuration_rt_detr_resnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/image_processing_rt_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/image_processing_rt_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..504bc5537a8274c8325a3c0998907420ac5cff4d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/image_processing_rt_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/modular_rt_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/modular_rt_detr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f71d4c33f8dcc1f09f9b4472bc6ca8673fa27f36 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/modular_rt_detr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/configuration_rt_detr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/configuration_rt_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..994d4a6fd6f02a9c11817c3ad1bdcdd83a14bb24 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/configuration_rt_detr.py @@ -0,0 +1,372 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""RT-DETR model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto import CONFIG_MAPPING +from .configuration_rt_detr_resnet import RTDetrResNetConfig + + +logger = logging.get_logger(__name__) + + +class RTDetrConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RTDetrModel`]. It is used to instantiate a + RT-DETR model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the RT-DETR + [PekingU/rtdetr_r50vd](https://huggingface.co/PekingU/rtdetr_r50vd) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + initializer_range (`float`, *optional*, defaults to 0.01): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_bias_prior_prob (`float`, *optional*): + The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`. + If `None`, `prior_prob` computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + batch_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the batch normalization layers. + backbone_config (`Dict`, *optional*, defaults to `RTDetrResNetConfig()`): + The configuration of the backbone model. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, defaults to `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. + freeze_backbone_batch_norms (`bool`, *optional*, defaults to `True`): + Whether to freeze the batch normalization layers in the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + encoder_hidden_dim (`int`, *optional*, defaults to 256): + Dimension of the layers in hybrid encoder. + encoder_in_channels (`list`, *optional*, defaults to `[512, 1024, 2048]`): + Multi level features input for encoder. + feat_strides (`list[int]`, *optional*, defaults to `[8, 16, 32]`): + Strides used in each feature map. + encoder_layers (`int`, *optional*, defaults to 1): + Total of layers to be used by the encoder. + encoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + dropout (`float`, *optional*, defaults to 0.0): + The ratio for all dropout layers. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + encode_proj_layers (`list[int]`, *optional*, defaults to `[2]`): + Indexes of the projected layers to be used in the encoder. + positional_encoding_temperature (`int`, *optional*, defaults to 10000): + The temperature parameter used to create the positional encodings. + encoder_activation_function (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + activation_function (`str`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the general layer. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + eval_size (`tuple[int, int]`, *optional*): + Height and width used to computes the effective height and width of the position embeddings after taking + into account the stride. + normalize_before (`bool`, *optional*, defaults to `False`): + Determine whether to apply layer normalization in the transformer encoder layer before self-attention and + feed-forward modules. + hidden_expansion (`float`, *optional*, defaults to 1.0): + Expansion ratio to enlarge the dimension size of RepVGGBlock and CSPRepLayer. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers exclude hybrid encoder. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries. + decoder_in_channels (`list`, *optional*, defaults to `[256, 256, 256]`): + Multi level features dimension for decoder + decoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + num_feature_levels (`int`, *optional*, defaults to 3): + The number of input feature levels. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_activation_function (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the decoder. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + num_denoising (`int`, *optional*, defaults to 100): + The total number of denoising tasks or queries to be used for contrastive denoising. + label_noise_ratio (`float`, *optional*, defaults to 0.5): + The fraction of denoising labels to which random noise should be added. + box_noise_scale (`float`, *optional*, defaults to 1.0): + Scale or magnitude of noise to be added to the bounding boxes. + learn_initial_query (`bool`, *optional*, defaults to `False`): + Indicates whether the initial query embeddings for the decoder should be learned during training + anchor_image_size (`tuple[int, int]`, *optional*): + Height and width of the input image used during evaluation to generate the bounding box anchors. If None, automatic generate anchor is applied. + disable_custom_kernels (`bool`, *optional*, defaults to `True`): + Whether to disable custom kernels. + with_box_refine (`bool`, *optional*, defaults to `True`): + Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes + based on the predictions from the previous layer. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether the architecture has an encoder decoder structure. + matcher_alpha (`float`, *optional*, defaults to 0.25): + Parameter alpha used by the Hungarian Matcher. + matcher_gamma (`float`, *optional*, defaults to 2.0): + Parameter gamma used by the Hungarian Matcher. + matcher_class_cost (`float`, *optional*, defaults to 2.0): + The relative weight of the class loss used by the Hungarian Matcher. + matcher_bbox_cost (`float`, *optional*, defaults to 5.0): + The relative weight of the bounding box loss used by the Hungarian Matcher. + matcher_giou_cost (`float`, *optional*, defaults to 2.0): + The relative weight of the giou loss of used by the Hungarian Matcher. + use_focal_loss (`bool`, *optional*, defaults to `True`): + Parameter informing if focal focal should be used. + auxiliary_loss (`bool`, *optional*, defaults to `True`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + focal_loss_alpha (`float`, *optional*, defaults to 0.75): + Parameter alpha used to compute the focal loss. + focal_loss_gamma (`float`, *optional*, defaults to 2.0): + Parameter gamma used to compute the focal loss. + weight_loss_vfl (`float`, *optional*, defaults to 1.0): + Relative weight of the varifocal loss in the object detection loss. + weight_loss_bbox (`float`, *optional*, defaults to 5.0): + Relative weight of the L1 bounding box loss in the object detection loss. + weight_loss_giou (`float`, *optional*, defaults to 2.0): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.0001): + Relative classification weight of the 'no-object' class in the object detection loss. + + Examples: + + ```python + >>> from transformers import RTDetrConfig, RTDetrModel + + >>> # Initializing a RT-DETR configuration + >>> configuration = RTDetrConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = RTDetrModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "rt_detr" + layer_types = ["basic", "bottleneck"] + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + initializer_range=0.01, + initializer_bias_prior_prob=None, + layer_norm_eps=1e-5, + batch_norm_eps=1e-5, + # backbone + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + use_timm_backbone=False, + freeze_backbone_batch_norms=True, + backbone_kwargs=None, + # encoder HybridEncoder + encoder_hidden_dim=256, + encoder_in_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + encoder_layers=1, + encoder_ffn_dim=1024, + encoder_attention_heads=8, + dropout=0.0, + activation_dropout=0.0, + encode_proj_layers=[2], + positional_encoding_temperature=10000, + encoder_activation_function="gelu", + activation_function="silu", + eval_size=None, + normalize_before=False, + hidden_expansion=1.0, + # decoder RTDetrTransformer + d_model=256, + num_queries=300, + decoder_in_channels=[256, 256, 256], + decoder_ffn_dim=1024, + num_feature_levels=3, + decoder_n_points=4, + decoder_layers=6, + decoder_attention_heads=8, + decoder_activation_function="relu", + attention_dropout=0.0, + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, + learn_initial_query=False, + anchor_image_size=None, + disable_custom_kernels=True, + with_box_refine=True, + is_encoder_decoder=True, + # Loss + matcher_alpha=0.25, + matcher_gamma=2.0, + matcher_class_cost=2.0, + matcher_bbox_cost=5.0, + matcher_giou_cost=2.0, + use_focal_loss=True, + auxiliary_loss=True, + focal_loss_alpha=0.75, + focal_loss_gamma=2.0, + weight_loss_vfl=1.0, + weight_loss_bbox=5.0, + weight_loss_giou=2.0, + eos_coefficient=1e-4, + **kwargs, + ): + self.initializer_range = initializer_range + self.initializer_bias_prior_prob = initializer_bias_prior_prob + self.layer_norm_eps = layer_norm_eps + self.batch_norm_eps = batch_norm_eps + # backbone + if backbone_config is None and backbone is None: + logger.info( + "`backbone_config` and `backbone` are `None`. Initializing the config with the default `RTDetr-ResNet` backbone." + ) + backbone_config = RTDetrResNetConfig( + num_channels=3, + embedding_size=64, + hidden_sizes=[256, 512, 1024, 2048], + depths=[3, 4, 6, 3], + layer_type="bottleneck", + hidden_act="relu", + downsample_in_first_stage=False, + downsample_in_bottleneck=False, + out_features=None, + out_indices=[2, 3, 4], + ) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.pop("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.freeze_backbone_batch_norms = freeze_backbone_batch_norms + self.backbone_kwargs = backbone_kwargs + # encoder + self.encoder_hidden_dim = encoder_hidden_dim + self.encoder_in_channels = encoder_in_channels + self.feat_strides = feat_strides + self.encoder_attention_heads = encoder_attention_heads + self.encoder_ffn_dim = encoder_ffn_dim + self.dropout = dropout + self.activation_dropout = activation_dropout + self.encode_proj_layers = encode_proj_layers + self.encoder_layers = encoder_layers + self.positional_encoding_temperature = positional_encoding_temperature + self.eval_size = eval_size + self.normalize_before = normalize_before + self.encoder_activation_function = encoder_activation_function + self.activation_function = activation_function + self.hidden_expansion = hidden_expansion + # decoder + self.d_model = d_model + self.num_queries = num_queries + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_in_channels = decoder_in_channels + self.num_feature_levels = num_feature_levels + self.decoder_n_points = decoder_n_points + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.decoder_activation_function = decoder_activation_function + self.attention_dropout = attention_dropout + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + self.learn_initial_query = learn_initial_query + self.anchor_image_size = anchor_image_size + self.auxiliary_loss = auxiliary_loss + self.disable_custom_kernels = disable_custom_kernels + self.with_box_refine = with_box_refine + # Loss + self.matcher_alpha = matcher_alpha + self.matcher_gamma = matcher_gamma + self.matcher_class_cost = matcher_class_cost + self.matcher_bbox_cost = matcher_bbox_cost + self.matcher_giou_cost = matcher_giou_cost + self.use_focal_loss = use_focal_loss + self.focal_loss_alpha = focal_loss_alpha + self.focal_loss_gamma = focal_loss_gamma + self.weight_loss_vfl = weight_loss_vfl + self.weight_loss_bbox = weight_loss_bbox + self.weight_loss_giou = weight_loss_giou + self.eos_coefficient = eos_coefficient + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + @property + def sub_configs(self): + return ( + {"backbone_config": type(self.backbone_config)} + if getattr(self, "backbone_config", None) is not None + else {} + ) + + @classmethod + def from_backbone_configs(cls, backbone_config: PretrainedConfig, **kwargs): + """Instantiate a [`RTDetrConfig`] (or a derived class) from a pre-trained backbone model configuration and DETR model + configuration. + + Args: + backbone_config ([`PretrainedConfig`]): + The backbone configuration. + + Returns: + [`RTDetrConfig`]: An instance of a configuration object + """ + return cls( + backbone_config=backbone_config, + **kwargs, + ) + + +__all__ = ["RTDetrConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/configuration_rt_detr_resnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/configuration_rt_detr_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..73b9517ab1498fe7fb91fd8f67a0827a55cacc7d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/configuration_rt_detr_resnet.py @@ -0,0 +1,114 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""RT-DETR ResNet model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class RTDetrResNetConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RTDetrResnetBackbone`]. It is used to instantiate an + ResNet model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the ResNet + [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + embedding_size (`int`, *optional*, defaults to 64): + Dimensionality (hidden size) for the embedding layer. + hidden_sizes (`list[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`): + Dimensionality (hidden size) at each stage. + depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 3]`): + Depth (number of layers) for each stage. + layer_type (`str`, *optional*, defaults to `"bottleneck"`): + The layer to use, it can be either `"basic"` (used for smaller models, like resnet-18 or resnet-34) or + `"bottleneck"` (used for larger models like resnet-50 and above). + hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` + are supported. + downsample_in_first_stage (`bool`, *optional*, defaults to `False`): + If `True`, the first stage will downsample the inputs using a `stride` of 2. + downsample_in_bottleneck (`bool`, *optional*, defaults to `False`): + If `True`, the first conv 1x1 in ResNetBottleNeckLayer will downsample the inputs using a `stride` of 2. + out_features (`list[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`list[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + + Example: + ```python + >>> from transformers import RTDetrResNetConfig, RTDetrResnetBackbone + + >>> # Initializing a ResNet resnet-50 style configuration + >>> configuration = RTDetrResNetConfig() + + >>> # Initializing a model (with random weights) from the resnet-50 style configuration + >>> model = RTDetrResnetBackbone(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "rt_detr_resnet" + layer_types = ["basic", "bottleneck"] + + def __init__( + self, + num_channels=3, + embedding_size=64, + hidden_sizes=[256, 512, 1024, 2048], + depths=[3, 4, 6, 3], + layer_type="bottleneck", + hidden_act="relu", + downsample_in_first_stage=False, + downsample_in_bottleneck=False, + out_features=None, + out_indices=None, + **kwargs, + ): + super().__init__(**kwargs) + if layer_type not in self.layer_types: + raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}") + self.num_channels = num_channels + self.embedding_size = embedding_size + self.hidden_sizes = hidden_sizes + self.depths = depths + self.layer_type = layer_type + self.hidden_act = hidden_act + self.downsample_in_first_stage = downsample_in_first_stage + self.downsample_in_bottleneck = downsample_in_bottleneck + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + + +__all__ = ["RTDetrResNetConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/image_processing_rt_detr_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..9aae271deaccbab9784dd7e02059eba29d32d5f2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/image_processing_rt_detr_fast.py @@ -0,0 +1,559 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/rt_detr/modular_rt_detr.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_rt_detr.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import pathlib +from typing import Any, Optional, Union + +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import BatchFeature +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + SizeDict, + get_image_size_for_max_height_width, + get_max_height_width, + safe_squeeze, +) +from ...image_transforms import center_to_corners_format, corners_to_center_format +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + AnnotationFormat, + AnnotationType, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + validate_annotations, +) +from ...processing_utils import Unpack +from ...utils import TensorType, auto_docstring, requires_backends +from ...utils.import_utils import requires +from .image_processing_rt_detr import get_size_with_aspect_ratio + + +class RTDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + + +SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) + + +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by RT-DETR. + """ + image_height, image_width = image.size()[-2:] + + image_id = target["image_id"] + image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device) + + # Get all COCO annotations for the given image. + annotations = target["annotations"] + classes = [] + area = [] + boxes = [] + keypoints = [] + for obj in annotations: + if "iscrowd" not in obj or obj["iscrowd"] == 0: + classes.append(obj["category_id"]) + area.append(obj["area"]) + boxes.append(obj["bbox"]) + if "keypoints" in obj: + keypoints.append(obj["keypoints"]) + + classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device) + area = torch.as_tensor(area, dtype=torch.float32, device=image.device) + iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device) + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = { + "image_id": image_id, + "class_labels": classes[keep], + "boxes": boxes[keep], + "area": area[keep], + "iscrowd": iscrowd[keep], + "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device), + } + + if keypoints: + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints + + return new_target + + +@auto_docstring +@requires(backends=("torchvision", "torch")) +class RTDetrImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BILINEAR + image_mean = IMAGENET_DEFAULT_MEAN + image_std = IMAGENET_DEFAULT_STD + format = AnnotationFormat.COCO_DETECTION + do_resize = True + do_rescale = True + do_normalize = False + do_pad = False + size = {"height": 640, "width": 640} + default_to_square = False + model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = RTDetrFastImageProcessorKwargs + do_convert_annotations = True + + def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None: + # Backwards compatibility + do_convert_annotations = kwargs.get("do_convert_annotations") + do_normalize = kwargs.get("do_normalize") + if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None: + self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize + + super().__init__(**kwargs) + + def prepare_annotation( + self, + image: torch.Tensor, + target: dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> dict: + """ + Prepare an annotation for feeding into RT_DETR model. + """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + def resize( + self, + image: torch.Tensor, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"] = None, + **kwargs, + ) -> torch.Tensor: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + Resampling filter to use if resizing the image. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if size.shortest_edge and size.longest_edge: + # Resize the image so that the shortest edge or the longest edge is of the given size + # while maintaining the aspect ratio of the original image. + new_size = get_size_with_aspect_ratio( + image.size()[-2:], + size["shortest_edge"], + size["longest_edge"], + ) + elif size.max_height and size.max_width: + new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"]) + elif size.height and size.width: + new_size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + + image = F.resize( + image, + size=new_size, + interpolation=interpolation, + **kwargs, + ) + return image + + def resize_annotation( + self, + annotation: dict[str, Any], + orig_size: tuple[int, int], + target_size: tuple[int, int], + threshold: float = 0.5, + interpolation: Optional["F.InterpolationMode"] = None, + ): + """ + Resizes an annotation to a target size. + + Args: + annotation (`dict[str, Any]`): + The annotation dictionary. + orig_size (`tuple[int, int]`): + The original size of the input image. + target_size (`tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`): + The resampling filter to use when resizing the masks. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT + ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * torch.as_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device + ) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks] + masks = torch.stack(masks).to(torch.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= torch.as_tensor( + [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device + ) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + def _update_annotation_for_padded_image( + self, + annotation: dict, + input_image_size: tuple[int, int], + output_image_size: tuple[int, int], + padding, + update_bboxes, + ) -> dict: + """ + Update the annotation for a padded image. + """ + new_annotation = {} + new_annotation["size"] = output_image_size + ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size)) + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = F.pad( + masks, + padding, + fill=0, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + def pad( + self, + image: torch.Tensor, + padded_size: tuple[int, int], + annotation: Optional[dict[str, Any]] = None, + update_bboxes: bool = True, + fill: int = 0, + ): + original_size = image.size()[-2:] + padding_bottom = padded_size[0] - original_size[0] + padding_right = padded_size[1] - original_size[1] + if padding_bottom < 0 or padding_right < 0: + raise ValueError( + f"Padding dimensions are negative. Please make sure that the padded size is larger than the " + f"original size. Got padded size: {padded_size}, original size: {original_size}." + ) + if original_size != padded_size: + padding = [0, 0, padding_right, padding_bottom] + image = F.pad(image, padding, fill=fill) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, original_size, padded_size, padding, update_bboxes + ) + + # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device) + pixel_mask[: original_size[0], : original_size[1]] = 1 + + return image, pixel_mask, annotation + + @auto_docstring + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + **kwargs: Unpack[RTDetrFastImageProcessorKwargs], + ) -> BatchFeature: + r""" + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + """ + return super().preprocess(images, annotations, masks_path, **kwargs) + + def _preprocess( + self, + images: list["torch.Tensor"], + annotations: Optional[Union[AnnotationType, list[AnnotationType]]], + masks_path: Optional[Union[str, pathlib.Path]], + return_segmentation_masks: bool, + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + do_convert_annotations: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + do_pad: bool, + pad_size: Optional[SizeDict], + format: Optional[Union[str, AnnotationFormat]], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + """ + + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + data = {} + processed_images = [] + processed_annotations = [] + pixel_masks = [] # Initialize pixel_masks here + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + annotation = self.prepare_annotation( + image, + annotation, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=ChannelDimension.FIRST, + ) + + if do_resize: + resized_image = self.resize(image, size=size, interpolation=interpolation) + if annotations is not None: + annotation = self.resize_annotation( + annotation, + orig_size=image.size()[-2:], + target_size=resized_image.size()[-2:], + ) + image = resized_image + # Fused rescale and normalize + image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std) + if do_convert_annotations and annotations is not None: + annotation = self.normalize_annotation(annotation, get_image_size(image, ChannelDimension.FIRST)) + + processed_images.append(image) + processed_annotations.append(annotation) + images = processed_images + annotations = processed_annotations if annotations is not None else None + + if do_pad: + # depends on all resized image shapes so we need another loop + if pad_size is not None: + padded_size = (pad_size.height, pad_size.width) + else: + padded_size = get_max_height_width(images) + + padded_images = [] + padded_annotations = [] + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + if padded_size == image.size()[-2:]: + padded_images.append(image) + pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device)) + padded_annotations.append(annotation) + continue + image, pixel_mask, annotation = self.pad( + image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations + ) + padded_images.append(image) + padded_annotations.append(annotation) + pixel_masks.append(pixel_mask) + images = padded_images + annotations = padded_annotations if annotations is not None else None + data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)}) + + data.update({"pixel_values": torch.stack(images, dim=0)}) + encoded_inputs = BatchFeature(data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + return encoded_inputs + + def post_process_object_detection( + self, + outputs, + threshold: float = 0.5, + target_sizes: Union[TensorType, list[tuple]] = None, + use_focal_loss: bool = True, + ): + """ + Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.5): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + use_focal_loss (`bool` defaults to `True`): + Variable informing if the focal loss was used to predict the outputs. If `True`, a sigmoid is applied + to compute the scores of each detection, otherwise, a softmax function is used. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + requires_backends(self, ["torch"]) + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + # convert from relative cxcywh to absolute xyxy + boxes = center_to_corners_format(out_bbox) + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + if isinstance(target_sizes, list): + img_h, img_w = torch.as_tensor(target_sizes).unbind(1) + else: + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + num_top_queries = out_logits.shape[1] + num_classes = out_logits.shape[2] + + if use_focal_loss: + scores = torch.nn.functional.sigmoid(out_logits) + scores, index = torch.topk(scores.flatten(1), num_top_queries, axis=-1) + labels = index % num_classes + index = index // num_classes + boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1])) + else: + scores = torch.nn.functional.softmax(out_logits)[:, :, :-1] + scores, labels = scores.max(dim=-1) + if scores.shape[1] > num_top_queries: + scores, index = torch.topk(scores, num_top_queries, dim=-1) + labels = torch.gather(labels, dim=1, index=index) + boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1])) + + results = [] + for score, label, box in zip(scores, labels, boxes): + results.append( + { + "scores": score[score > threshold], + "labels": label[score > threshold], + "boxes": box[score > threshold], + } + ) + + return results + + +__all__ = ["RTDetrImageProcessorFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/modeling_rt_detr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/modeling_rt_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..1d4b64496969acb596b037841c745b109f3d11b5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/modeling_rt_detr.py @@ -0,0 +1,2013 @@ +# coding=utf-8 +# Copyright 2024 Baidu Inc and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch RT-DETR model.""" + +import math +import warnings +from dataclasses import dataclass +from functools import partial +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from ...activations import ACT2CLS, ACT2FN +from ...image_transforms import center_to_corners_format, corners_to_center_format +from ...integrations import use_kernel_forward_from_hub +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import compile_compatible_method_lru_cache +from ...utils import ( + ModelOutput, + auto_docstring, + logging, + torch_int, +) +from ...utils.backbone_utils import load_backbone +from .configuration_rt_detr import RTDetrConfig + + +logger = logging.get_logger(__name__) + + +# TODO: Replace all occurrences of the checkpoint with the final one + + +@use_kernel_forward_from_hub("MultiScaleDeformableAttention") +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttention +class MultiScaleDeformableAttention(nn.Module): + def forward( + self, + value: Tensor, + value_spatial_shapes: Tensor, + value_spatial_shapes_list: list[tuple], + level_start_index: Tensor, + sampling_locations: Tensor, + attention_weights: Tensor, + im2col_step: int, + ): + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height * width for height, width in value_spatial_shapes_list], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes_list): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id] + .flatten(2) + .transpose(1, 2) + .reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, + sampling_grid_l_, + mode="bilinear", + padding_mode="zeros", + align_corners=False, + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the RTDetrDecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + """ +) +class RTDetrDecoderOutput(ModelOutput): + r""" + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`): + Stacked intermediate logits (logits of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate predicted corners (predicted corners of each layer of the decoder). + initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked initial reference points (initial reference points of each layer of the decoder). + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_logits: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + intermediate_predicted_corners: Optional[torch.FloatTensor] = None + initial_reference_points: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the RT-DETR encoder-decoder model. + """ +) +class RTDetrModelOutput(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`): + Stacked intermediate logits (logits of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate predicted corners (predicted corners of each layer of the decoder). + initial_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points used for the first decoder layer. + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the encoder stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`): + Logits of predicted bounding boxes coordinates in the encoder stage. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + denoising_meta_values (`dict`): + Extra dictionary for the denoising related values. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_logits: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + intermediate_predicted_corners: Optional[torch.FloatTensor] = None + initial_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + init_reference_points: Optional[torch.FloatTensor] = None + enc_topk_logits: Optional[torch.FloatTensor] = None + enc_topk_bboxes: Optional[torch.FloatTensor] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + denoising_meta_values: Optional[dict] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`RTDetrForObjectDetection`]. + """ +) +class RTDetrObjectDetectionOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~RTDetrImageProcessor.post_process_object_detection`] to retrieve the + unnormalized (absolute) bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, config.num_labels)`): + Stacked intermediate logits (logits of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate predicted corners (predicted corners of each layer of the decoder). + initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked initial reference points (initial reference points of each layer of the decoder). + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the encoder. + enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the encoder. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + denoising_meta_values (`dict`): + Extra dictionary for the denoising related values + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[dict] = None + logits: Optional[torch.FloatTensor] = None + pred_boxes: Optional[torch.FloatTensor] = None + auxiliary_outputs: Optional[list[dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_logits: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + intermediate_predicted_corners: Optional[torch.FloatTensor] = None + initial_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + init_reference_points: Optional[tuple[torch.FloatTensor]] = None + enc_topk_logits: Optional[torch.FloatTensor] = None + enc_topk_bboxes: Optional[torch.FloatTensor] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + denoising_meta_values: Optional[dict] = None + + +def _get_clones(partial_module, N): + return nn.ModuleList([partial_module() for i in range(N)]) + + +# Copied from transformers.models.conditional_detr.modeling_conditional_detr.inverse_sigmoid +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->RTDetr +class RTDetrFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->RTDetr +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `RTDetrFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = RTDetrFrozenBatchNorm2d(module.num_features) + + if module.weight.device != torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +def get_contrastive_denoising_training_group( + targets, + num_classes, + num_queries, + class_embed, + num_denoising_queries=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, +): + """ + Creates a contrastive denoising training group using ground-truth samples. It adds noise to labels and boxes. + + Args: + targets (`list[dict]`): + The target objects, each containing 'class_labels' and 'boxes' for objects in an image. + num_classes (`int`): + Total number of classes in the dataset. + num_queries (`int`): + Number of query slots in the transformer. + class_embed (`callable`): + A function or a model layer to embed class labels. + num_denoising_queries (`int`, *optional*, defaults to 100): + Number of denoising queries. + label_noise_ratio (`float`, *optional*, defaults to 0.5): + Ratio of noise applied to labels. + box_noise_scale (`float`, *optional*, defaults to 1.0): + Scale of noise applied to bounding boxes. + Returns: + `tuple` comprising various elements: + - **input_query_class** (`torch.FloatTensor`) -- + Class queries with applied label noise. + - **input_query_bbox** (`torch.FloatTensor`) -- + Bounding box queries with applied box noise. + - **attn_mask** (`torch.FloatTensor`) -- + Attention mask for separating denoising and reconstruction queries. + - **denoising_meta_values** (`dict`) -- + Metadata including denoising positive indices, number of groups, and split sizes. + """ + + if num_denoising_queries <= 0: + return None, None, None, None + + num_ground_truths = [len(t["class_labels"]) for t in targets] + device = targets[0]["class_labels"].device + + max_gt_num = max(num_ground_truths) + if max_gt_num == 0: + return None, None, None, None + + num_groups_denoising_queries = num_denoising_queries // max_gt_num + num_groups_denoising_queries = 1 if num_groups_denoising_queries == 0 else num_groups_denoising_queries + # pad gt to max_num of a batch + batch_size = len(num_ground_truths) + + input_query_class = torch.full([batch_size, max_gt_num], num_classes, dtype=torch.int32, device=device) + input_query_bbox = torch.zeros([batch_size, max_gt_num, 4], device=device) + pad_gt_mask = torch.zeros([batch_size, max_gt_num], dtype=torch.bool, device=device) + + for i in range(batch_size): + num_gt = num_ground_truths[i] + if num_gt > 0: + input_query_class[i, :num_gt] = targets[i]["class_labels"] + input_query_bbox[i, :num_gt] = targets[i]["boxes"] + pad_gt_mask[i, :num_gt] = 1 + # each group has positive and negative queries. + input_query_class = input_query_class.tile([1, 2 * num_groups_denoising_queries]) + input_query_bbox = input_query_bbox.tile([1, 2 * num_groups_denoising_queries, 1]) + pad_gt_mask = pad_gt_mask.tile([1, 2 * num_groups_denoising_queries]) + # positive and negative mask + negative_gt_mask = torch.zeros([batch_size, max_gt_num * 2, 1], device=device) + negative_gt_mask[:, max_gt_num:] = 1 + negative_gt_mask = negative_gt_mask.tile([1, num_groups_denoising_queries, 1]) + positive_gt_mask = 1 - negative_gt_mask + # contrastive denoising training positive index + positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask + denoise_positive_idx = torch.nonzero(positive_gt_mask)[:, 1] + denoise_positive_idx = torch.split( + denoise_positive_idx, [n * num_groups_denoising_queries for n in num_ground_truths] + ) + # total denoising queries + num_denoising_queries = torch_int(max_gt_num * 2 * num_groups_denoising_queries) + + if label_noise_ratio > 0: + mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5) + # randomly put a new one here + new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype) + input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class) + + if box_noise_scale > 0: + known_bbox = center_to_corners_format(input_query_bbox) + diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale + rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 + rand_part = torch.rand_like(input_query_bbox) + rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask) + rand_part *= rand_sign + known_bbox += rand_part * diff + known_bbox.clip_(min=0.0, max=1.0) + input_query_bbox = corners_to_center_format(known_bbox) + input_query_bbox = inverse_sigmoid(input_query_bbox) + + input_query_class = class_embed(input_query_class) + + target_size = num_denoising_queries + num_queries + attn_mask = torch.full([target_size, target_size], 0, dtype=torch.float, device=device) + # match query cannot see the reconstruction + attn_mask[num_denoising_queries:, :num_denoising_queries] = -torch.inf + + # reconstructions cannot see each other + for i in range(num_groups_denoising_queries): + idx_block_start = max_gt_num * 2 * i + idx_block_end = max_gt_num * 2 * (i + 1) + attn_mask[idx_block_start:idx_block_end, :idx_block_start] = -torch.inf + attn_mask[idx_block_start:idx_block_end, idx_block_end:num_denoising_queries] = -torch.inf + + denoising_meta_values = { + "dn_positive_idx": denoise_positive_idx, + "dn_num_group": num_groups_denoising_queries, + "dn_num_split": [num_denoising_queries, num_queries], + } + + return input_query_class, input_query_bbox, attn_mask, denoising_meta_values + + +class RTDetrConvEncoder(nn.Module): + """ + Convolutional backbone using the modeling_rt_detr_resnet.py. + + nn.BatchNorm2d layers are replaced by RTDetrFrozenBatchNorm2d as defined above. + https://github.com/lyuwenyu/RT-DETR/blob/main/rtdetr_pytorch/src/nn/backbone/presnet.py#L142 + """ + + def __init__(self, config): + super().__init__() + + backbone = load_backbone(config) + + if config.freeze_backbone_batch_norms: + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = self.model.channels + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +class RTDetrConvNormLayer(nn.Module): + def __init__(self, config, in_channels, out_channels, kernel_size, stride, padding=None, activation=None): + super().__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2 if padding is None else padding, + bias=False, + ) + self.norm = nn.BatchNorm2d(out_channels, config.batch_norm_eps) + self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() + + def forward(self, hidden_state): + hidden_state = self.conv(hidden_state) + hidden_state = self.norm(hidden_state) + hidden_state = self.activation(hidden_state) + return hidden_state + + +class RTDetrEncoderLayer(nn.Module): + def __init__(self, config: RTDetrConfig): + super().__init__() + self.normalize_before = config.normalize_before + + # self-attention + self.self_attn = RTDetrMultiheadAttention( + embed_dim=config.encoder_hidden_dim, + num_heads=config.num_attention_heads, + dropout=config.dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(config.encoder_hidden_dim, eps=config.layer_norm_eps) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.encoder_activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(config.encoder_hidden_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, config.encoder_hidden_dim) + self.final_layer_norm = nn.LayerNorm(config.encoder_hidden_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + **kwargs, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + position_embeddings (`torch.FloatTensor`, *optional*): + Object queries (also called content embeddings), to be added to the hidden states. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + if self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + if self.normalize_before: + hidden_states = self.final_layer_norm(hidden_states) + residual = hidden_states + + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class RTDetrRepVggBlock(nn.Module): + """ + RepVGG architecture block introduced by the work "RepVGG: Making VGG-style ConvNets Great Again". + """ + + def __init__(self, config: RTDetrConfig): + super().__init__() + + activation = config.activation_function + hidden_channels = int(config.encoder_hidden_dim * config.hidden_expansion) + self.conv1 = RTDetrConvNormLayer(config, hidden_channels, hidden_channels, 3, 1, padding=1) + self.conv2 = RTDetrConvNormLayer(config, hidden_channels, hidden_channels, 1, 1, padding=0) + self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() + + def forward(self, x): + y = self.conv1(x) + self.conv2(x) + return self.activation(y) + + +class RTDetrCSPRepLayer(nn.Module): + """ + Cross Stage Partial (CSP) network layer with RepVGG blocks. + """ + + def __init__(self, config: RTDetrConfig): + super().__init__() + + in_channels = config.encoder_hidden_dim * 2 + out_channels = config.encoder_hidden_dim + num_blocks = 3 + activation = config.activation_function + + hidden_channels = int(out_channels * config.hidden_expansion) + self.conv1 = RTDetrConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation) + self.conv2 = RTDetrConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation) + self.bottlenecks = nn.Sequential(*[RTDetrRepVggBlock(config) for _ in range(num_blocks)]) + if hidden_channels != out_channels: + self.conv3 = RTDetrConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=activation) + else: + self.conv3 = nn.Identity() + + def forward(self, hidden_state): + hidden_state_1 = self.conv1(hidden_state) + hidden_state_1 = self.bottlenecks(hidden_state_1) + hidden_state_2 = self.conv2(hidden_state) + return self.conv3(hidden_state_1 + hidden_state_2) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->RTDetr +class RTDetrMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Deformable DETR. + """ + + def __init__(self, config: RTDetrConfig, num_heads: int, n_points: int): + super().__init__() + + self.attn = MultiScaleDeformableAttention() + + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in RTDetrMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." + ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + total_elements = sum(height * width for height, width in spatial_shapes_list) + if total_elements != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + num_coordinates = reference_points.shape[-1] + if num_coordinates == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif num_coordinates == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + output = self.attn( + value, + spatial_shapes, + spatial_shapes_list, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + + output = self.output_proj(output) + + return output, attention_weights + + +class RTDetrMultiheadAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _reshape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, target_len, embed_dim = hidden_states.size() + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # get queries, keys and values + query_states = self.q_proj(hidden_states) * self.scaling + key_states = self._reshape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._reshape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._reshape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + # expand attention_mask + if attention_mask is not None: + # [seq_len, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = attention_mask.expand(batch_size, 1, *attention_mask.size()) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + if attention_mask.dtype == torch.bool: + attention_mask = torch.zeros_like(attention_mask, dtype=attn_weights.dtype).masked_fill_( + attention_mask, -torch.inf + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class RTDetrDecoderLayer(nn.Module): + def __init__(self, config: RTDetrConfig): + super().__init__() + # self-attention + self.self_attn = RTDetrMultiheadAttention( + embed_dim=config.d_model, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.decoder_activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps) + # cross-attention + self.encoder_attn = RTDetrMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps) + # feedforward neural networks + self.fc1 = nn.Linear(config.d_model, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, config.d_model) + self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(seq_len, batch, embed_dim)`. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +@auto_docstring +class RTDetrPreTrainedModel(PreTrainedModel): + config: RTDetrConfig + base_model_prefix = "rt_detr" + main_input_name = "pixel_values" + _no_split_modules = [r"RTDetrHybridEncoder", r"RTDetrDecoderLayer"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (RTDetrForObjectDetection, RTDetrDecoder)): + if module.class_embed is not None: + for layer in module.class_embed: + prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1) + bias = float(-math.log((1 - prior_prob) / prior_prob)) + nn.init.xavier_uniform_(layer.weight) + nn.init.constant_(layer.bias, bias) + + if module.bbox_embed is not None: + for layer in module.bbox_embed: + nn.init.constant_(layer.layers[-1].weight, 0) + nn.init.constant_(layer.layers[-1].bias, 0) + + elif isinstance(module, RTDetrMultiscaleDeformableAttention): + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(module.n_heads, 1, 1, 2) + .repeat(1, module.n_levels, module.n_points, 1) + ) + for i in range(module.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(module.value_proj.weight.data) + nn.init.constant_(module.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(module.output_proj.weight.data) + nn.init.constant_(module.output_proj.bias.data, 0.0) + + elif isinstance(module, RTDetrModel): + prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1) + bias = float(-math.log((1 - prior_prob) / prior_prob)) + nn.init.xavier_uniform_(module.enc_score_head.weight) + nn.init.constant_(module.enc_score_head.bias, bias) + + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + + if hasattr(module, "weight_embedding") and self.config.learn_initial_query: + nn.init.xavier_uniform_(module.weight_embedding.weight) + if hasattr(module, "denoising_class_embed") and self.config.num_denoising > 0: + nn.init.xavier_uniform_(module.denoising_class_embed.weight) + + +class RTDetrEncoder(nn.Module): + def __init__(self, config: RTDetrConfig): + super().__init__() + + self.layers = nn.ModuleList([RTDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) + + def forward(self, src, src_mask=None, pos_embed=None, output_attentions: bool = False) -> torch.Tensor: + hidden_states = src + for layer in self.layers: + hidden_states = layer( + hidden_states, + attention_mask=src_mask, + position_embeddings=pos_embed, + output_attentions=output_attentions, + ) + return hidden_states + + +class RTDetrHybridEncoder(nn.Module): + """ + Decoder consisting of a projection layer, a set of `RTDetrEncoder`, a top-down Feature Pyramid Network + (FPN) and a bottom-up Path Aggregation Network (PAN). More details on the paper: https://huggingface.co/papers/2304.08069 + + Args: + config: RTDetrConfig + """ + + def __init__(self, config: RTDetrConfig): + super().__init__() + self.config = config + self.in_channels = config.encoder_in_channels + self.feat_strides = config.feat_strides + self.encoder_hidden_dim = config.encoder_hidden_dim + self.encode_proj_layers = config.encode_proj_layers + self.positional_encoding_temperature = config.positional_encoding_temperature + self.eval_size = config.eval_size + self.out_channels = [self.encoder_hidden_dim for _ in self.in_channels] + self.out_strides = self.feat_strides + self.num_fpn_stages = len(self.in_channels) - 1 + self.num_pan_stages = len(self.in_channels) - 1 + activation = config.activation_function + + # encoder transformer + self.encoder = nn.ModuleList([RTDetrEncoder(config) for _ in range(len(self.encode_proj_layers))]) + + # top-down FPN + self.lateral_convs = nn.ModuleList() + self.fpn_blocks = nn.ModuleList() + for _ in range(self.num_fpn_stages): + lateral_conv = RTDetrConvNormLayer( + config, + in_channels=self.encoder_hidden_dim, + out_channels=self.encoder_hidden_dim, + kernel_size=1, + stride=1, + activation=activation, + ) + fpn_block = RTDetrCSPRepLayer(config) + self.lateral_convs.append(lateral_conv) + self.fpn_blocks.append(fpn_block) + + # bottom-up PAN + self.downsample_convs = nn.ModuleList() + self.pan_blocks = nn.ModuleList() + for _ in range(self.num_pan_stages): + downsample_conv = RTDetrConvNormLayer( + config, + in_channels=self.encoder_hidden_dim, + out_channels=self.encoder_hidden_dim, + kernel_size=3, + stride=2, + activation=activation, + ) + pan_block = RTDetrCSPRepLayer(config) + self.downsample_convs.append(downsample_conv) + self.pan_blocks.append(pan_block) + + @staticmethod + def build_2d_sincos_position_embedding( + width, height, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32 + ): + grid_w = torch.arange(torch_int(width), device=device).to(dtype) + grid_h = torch.arange(torch_int(height), device=device).to(dtype) + grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij") + if embed_dim % 4 != 0: + raise ValueError("Embed dimension must be divisible by 4 for 2D sin-cos position embedding") + pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, device=device).to(dtype) / pos_dim + omega = 1.0 / (temperature**omega) + + out_w = grid_w.flatten()[..., None] @ omega[None] + out_h = grid_h.flatten()[..., None] @ omega[None] + + return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :] + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # encoder + if self.config.encoder_layers > 0: + for i, enc_ind in enumerate(self.encode_proj_layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states[enc_ind],) + height, width = hidden_states[enc_ind].shape[2:] + # flatten [batch, channel, height, width] to [batch, height*width, channel] + src_flatten = hidden_states[enc_ind].flatten(2).permute(0, 2, 1) + if self.training or self.eval_size is None: + pos_embed = self.build_2d_sincos_position_embedding( + width, + height, + self.encoder_hidden_dim, + self.positional_encoding_temperature, + device=src_flatten.device, + dtype=src_flatten.dtype, + ) + else: + pos_embed = None + + layer_outputs = self.encoder[i]( + src_flatten, + pos_embed=pos_embed, + output_attentions=output_attentions, + ) + hidden_states[enc_ind] = ( + layer_outputs[0].permute(0, 2, 1).reshape(-1, self.encoder_hidden_dim, height, width).contiguous() + ) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states[enc_ind],) + + # top-down FPN + fpn_feature_maps = [hidden_states[-1]] + for idx, (lateral_conv, fpn_block) in enumerate(zip(self.lateral_convs, self.fpn_blocks)): + backbone_feature_map = hidden_states[self.num_fpn_stages - idx - 1] + top_fpn_feature_map = fpn_feature_maps[-1] + # apply lateral block + top_fpn_feature_map = lateral_conv(top_fpn_feature_map) + fpn_feature_maps[-1] = top_fpn_feature_map + # apply fpn block + top_fpn_feature_map = F.interpolate(top_fpn_feature_map, scale_factor=2.0, mode="nearest") + fused_feature_map = torch.concat([top_fpn_feature_map, backbone_feature_map], dim=1) + new_fpn_feature_map = fpn_block(fused_feature_map) + fpn_feature_maps.append(new_fpn_feature_map) + + fpn_feature_maps = fpn_feature_maps[::-1] + + # bottom-up PAN + pan_feature_maps = [fpn_feature_maps[0]] + for idx, (downsample_conv, pan_block) in enumerate(zip(self.downsample_convs, self.pan_blocks)): + top_pan_feature_map = pan_feature_maps[-1] + fpn_feature_map = fpn_feature_maps[idx + 1] + downsampled_feature_map = downsample_conv(top_pan_feature_map) + fused_feature_map = torch.concat([downsampled_feature_map, fpn_feature_map], dim=1) + new_pan_feature_map = pan_block(fused_feature_map) + pan_feature_maps.append(new_pan_feature_map) + + if not return_dict: + return tuple(v for v in [pan_feature_maps, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=pan_feature_maps, hidden_states=encoder_states, attentions=all_attentions + ) + + +class RTDetrDecoder(RTDetrPreTrainedModel): + def __init__(self, config: RTDetrConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([RTDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.query_pos_head = RTDetrMLPPredictionHead(config, 4, 2 * config.d_model, config.d_model, num_layers=2) + + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR + self.bbox_embed = None + self.class_embed = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the decoder. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Position embeddings that are added to the queries and keys in each self-attention layer. + reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): + Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. + spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of the feature maps. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*): + Indexes for the start of each feature level. In range `[0, sequence_length]`. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*): + Ratio of valid area in each feature level. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + intermediate_logits = () + + reference_points = F.sigmoid(reference_points) + + # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L252 + for idx, decoder_layer in enumerate(self.layers): + reference_points_input = reference_points.unsqueeze(2) + position_embeddings = self.query_pos_head(reference_points) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + encoder_hidden_states=encoder_hidden_states, + reference_points=reference_points_input, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + predicted_corners = self.bbox_embed[idx](hidden_states) + new_reference_points = F.sigmoid(predicted_corners + inverse_sigmoid(reference_points)) + reference_points = new_reference_points.detach() + + intermediate += (hidden_states,) + intermediate_reference_points += ( + (new_reference_points,) if self.bbox_embed is not None else (reference_points,) + ) + + if self.class_embed is not None: + logits = self.class_embed[idx](hidden_states) + intermediate_logits += (logits,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + if self.class_embed is not None: + intermediate_logits = torch.stack(intermediate_logits, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_logits, + intermediate_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + return RTDetrDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_logits=intermediate_logits, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +class RTDetrMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + Origin from https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_paddle/ppdet/modeling/transformers/utils.py#L453 + + """ + + def __init__(self, config, input_dim, d_model, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [d_model] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +@auto_docstring( + custom_intro=""" + RT-DETR Model (consisting of a backbone and encoder-decoder) outputting raw hidden states without any head on top. + """ +) +class RTDetrModel(RTDetrPreTrainedModel): + def __init__(self, config: RTDetrConfig): + super().__init__(config) + + # Create backbone + self.backbone = RTDetrConvEncoder(config) + intermediate_channel_sizes = self.backbone.intermediate_channel_sizes + + # Create encoder input projection layers + # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/hybrid_encoder.py#L212 + num_backbone_outs = len(intermediate_channel_sizes) + encoder_input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = intermediate_channel_sizes[_] + encoder_input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.encoder_hidden_dim, kernel_size=1, bias=False), + nn.BatchNorm2d(config.encoder_hidden_dim), + ) + ) + self.encoder_input_proj = nn.ModuleList(encoder_input_proj_list) + + # Create encoder + self.encoder = RTDetrHybridEncoder(config) + + # denoising part + if config.num_denoising > 0: + self.denoising_class_embed = nn.Embedding( + config.num_labels + 1, config.d_model, padding_idx=config.num_labels + ) + + # decoder embedding + if config.learn_initial_query: + self.weight_embedding = nn.Embedding(config.num_queries, config.d_model) + + # encoder head + self.enc_output = nn.Sequential( + nn.Linear(config.d_model, config.d_model), + nn.LayerNorm(config.d_model, eps=config.layer_norm_eps), + ) + self.enc_score_head = nn.Linear(config.d_model, config.num_labels) + self.enc_bbox_head = RTDetrMLPPredictionHead(config, config.d_model, config.d_model, 4, num_layers=3) + + # init encoder output anchors and valid_mask + if config.anchor_image_size: + self.anchors, self.valid_mask = self.generate_anchors(dtype=self.dtype) + + # Create decoder input projection layers + # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L412 + num_backbone_outs = len(config.decoder_in_channels) + decoder_input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = config.decoder_in_channels[_] + decoder_input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1, bias=False), + nn.BatchNorm2d(config.d_model, config.batch_norm_eps), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + decoder_input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1, bias=False), + nn.BatchNorm2d(config.d_model, config.batch_norm_eps), + ) + ) + in_channels = config.d_model + self.decoder_input_proj = nn.ModuleList(decoder_input_proj_list) + + # decoder + self.decoder = RTDetrDecoder(config) + + self.post_init() + + def get_encoder(self): + return self.encoder + + def freeze_backbone(self): + for param in self.backbone.parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for param in self.backbone.parameters(): + param.requires_grad_(True) + + @compile_compatible_method_lru_cache(maxsize=32) + def generate_anchors(self, spatial_shapes=None, grid_size=0.05, device="cpu", dtype=torch.float32): + if spatial_shapes is None: + spatial_shapes = [ + [int(self.config.anchor_image_size[0] / s), int(self.config.anchor_image_size[1] / s)] + for s in self.config.feat_strides + ] + anchors = [] + for level, (height, width) in enumerate(spatial_shapes): + grid_y, grid_x = torch.meshgrid( + torch.arange(end=height, device=device).to(dtype), + torch.arange(end=width, device=device).to(dtype), + indexing="ij", + ) + grid_xy = torch.stack([grid_x, grid_y], -1) + grid_xy = grid_xy.unsqueeze(0) + 0.5 + grid_xy[..., 0] /= width + grid_xy[..., 1] /= height + wh = torch.ones_like(grid_xy) * grid_size * (2.0**level) + anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, height * width, 4)) + # define the valid range for anchor coordinates + eps = 1e-2 + anchors = torch.concat(anchors, 1) + valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True) + anchors = torch.log(anchors / (1 - anchors)) + anchors = torch.where(valid_mask, anchors, torch.tensor(torch.finfo(dtype).max, dtype=dtype, device=device)) + + return anchors, valid_mask + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[list[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], RTDetrModelOutput]: + r""" + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + labels (`list[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, RTDetrModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("PekingU/rtdetr_r50vd") + >>> model = RTDetrModel.from_pretrained("PekingU/rtdetr_r50vd") + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), device=device) + + features = self.backbone(pixel_values, pixel_mask) + + proj_feats = [self.encoder_input_proj[level](source) for level, (source, mask) in enumerate(features)] + + if encoder_outputs is None: + encoder_outputs = self.encoder( + proj_feats, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if output_hidden_states else None, + attentions=encoder_outputs[2] + if len(encoder_outputs) > 2 + else encoder_outputs[1] + if output_attentions + else None, + ) + + # Equivalent to def _get_encoder_input + # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L412 + sources = [] + for level, source in enumerate(encoder_outputs[0]): + sources.append(self.decoder_input_proj[level](source)) + + # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(sources): + _len_sources = len(sources) + sources.append(self.decoder_input_proj[_len_sources](encoder_outputs[0])[-1]) + for i in range(_len_sources + 1, self.config.num_feature_levels): + sources.append(self.decoder_input_proj[i](encoder_outputs[0][-1])) + + # Prepare encoder inputs (by flattening) + source_flatten = [] + spatial_shapes_list = [] + spatial_shapes = torch.empty((len(sources), 2), device=device, dtype=torch.long) + for level, source in enumerate(sources): + height, width = source.shape[-2:] + spatial_shapes[level, 0] = height + spatial_shapes[level, 1] = width + spatial_shapes_list.append((height, width)) + source = source.flatten(2).transpose(1, 2) + source_flatten.append(source) + source_flatten = torch.cat(source_flatten, 1) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + # prepare denoising training + if self.training and self.config.num_denoising > 0 and labels is not None: + ( + denoising_class, + denoising_bbox_unact, + attention_mask, + denoising_meta_values, + ) = get_contrastive_denoising_training_group( + targets=labels, + num_classes=self.config.num_labels, + num_queries=self.config.num_queries, + class_embed=self.denoising_class_embed, + num_denoising_queries=self.config.num_denoising, + label_noise_ratio=self.config.label_noise_ratio, + box_noise_scale=self.config.box_noise_scale, + ) + else: + denoising_class, denoising_bbox_unact, attention_mask, denoising_meta_values = None, None, None, None + + batch_size = len(source_flatten) + device = source_flatten.device + dtype = source_flatten.dtype + + # prepare input for decoder + if self.training or self.config.anchor_image_size is None: + # Pass spatial_shapes as tuple to make it hashable and make sure + # lru_cache is working for generate_anchors() + spatial_shapes_tuple = tuple(spatial_shapes_list) + anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple, device=device, dtype=dtype) + else: + anchors, valid_mask = self.anchors, self.valid_mask + anchors, valid_mask = anchors.to(device, dtype), valid_mask.to(device, dtype) + + # use the valid_mask to selectively retain values in the feature map where the mask is `True` + memory = valid_mask.to(source_flatten.dtype) * source_flatten + + output_memory = self.enc_output(memory) + + enc_outputs_class = self.enc_score_head(output_memory) + enc_outputs_coord_logits = self.enc_bbox_head(output_memory) + anchors + + _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.config.num_queries, dim=1) + + reference_points_unact = enc_outputs_coord_logits.gather( + dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_coord_logits.shape[-1]) + ) + + enc_topk_bboxes = F.sigmoid(reference_points_unact) + if denoising_bbox_unact is not None: + reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1) + + enc_topk_logits = enc_outputs_class.gather( + dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1]) + ) + + # extract region features + if self.config.learn_initial_query: + target = self.weight_embedding.tile([batch_size, 1, 1]) + else: + target = output_memory.gather(dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, output_memory.shape[-1])) + target = target.detach() + + if denoising_class is not None: + target = torch.concat([denoising_class, target], 1) + + init_reference_points = reference_points_unact.detach() + + # decoder + decoder_outputs = self.decoder( + inputs_embeds=target, + encoder_hidden_states=source_flatten, + encoder_attention_mask=attention_mask, + reference_points=init_reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple( + value + for value in [enc_topk_logits, enc_topk_bboxes, enc_outputs_class, enc_outputs_coord_logits] + if value is not None + ) + dn_outputs = tuple(value if value is not None else None for value in [denoising_meta_values]) + tuple_outputs = decoder_outputs + encoder_outputs + (init_reference_points,) + enc_outputs + dn_outputs + + return tuple_outputs + + return RTDetrModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_logits=decoder_outputs.intermediate_logits, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + intermediate_predicted_corners=decoder_outputs.intermediate_predicted_corners, + initial_reference_points=decoder_outputs.initial_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + init_reference_points=init_reference_points, + enc_topk_logits=enc_topk_logits, + enc_topk_bboxes=enc_topk_bboxes, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + denoising_meta_values=denoising_meta_values, + ) + + +@auto_docstring( + custom_intro=""" + RT-DETR Model (consisting of a backbone and encoder-decoder) outputting bounding boxes and logits to be further + decoded into scores and classes. + """ +) +class RTDetrForObjectDetection(RTDetrPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = ["bbox_embed", "class_embed"] + # We can't initialize the model on meta device as some weights are modified during the initialization + _no_split_modules = None + + def __init__(self, config: RTDetrConfig): + super().__init__(config) + + # RTDETR encoder-decoder model + self.model = RTDetrModel(config) + + # Detection heads on top + self.class_embed = partial(nn.Linear, config.d_model, config.num_labels) + self.bbox_embed = partial(RTDetrMLPPredictionHead, config, config.d_model, config.d_model, 4, num_layers=3) + + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = config.decoder_layers + if config.with_box_refine: + self.class_embed = _get_clones(self.class_embed, num_pred) + self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + else: + self.class_embed = nn.ModuleList([self.class_embed() for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed() for _ in range(num_pred)]) + + # hack implementation for iterative bounding box refinement + self.model.decoder.class_embed = self.class_embed + self.model.decoder.bbox_embed = self.bbox_embed + + # Initialize weights and apply final processing + self.post_init() + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class, outputs_coord)] + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[list[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple[torch.FloatTensor], RTDetrObjectDetectionOutput]: + r""" + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + labels (`list[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Examples: + + ```python + >>> from transformers import RTDetrImageProcessor, RTDetrForObjectDetection + >>> from PIL import Image + >>> import requests + >>> import torch + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd") + >>> model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> # forward pass + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + >>> list(logits.shape) + [1, 300, 80] + + >>> boxes = outputs.pred_boxes + >>> list(boxes.shape) + [1, 300, 4] + + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[ + ... 0 + ... ] + + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ... ) + Detected sofa with confidence 0.97 at location [0.14, 0.38, 640.13, 476.21] + Detected cat with confidence 0.96 at location [343.38, 24.28, 640.14, 371.5] + Detected cat with confidence 0.958 at location [13.23, 54.18, 318.98, 472.22] + Detected remote with confidence 0.951 at location [40.11, 73.44, 175.96, 118.48] + Detected remote with confidence 0.924 at location [333.73, 76.58, 369.97, 186.99] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + denoising_meta_values = ( + outputs.denoising_meta_values if return_dict else outputs[-1] if self.training else None + ) + + outputs_class = outputs.intermediate_logits if return_dict else outputs[2] + outputs_coord = outputs.intermediate_reference_points if return_dict else outputs[3] + predicted_corners = outputs.intermediate_predicted_corners if return_dict else outputs[4] + initial_reference_points = outputs.initial_reference_points if return_dict else outputs[5] + + logits = outputs_class[:, -1] + pred_boxes = outputs_coord[:, -1] + + loss, loss_dict, auxiliary_outputs, enc_topk_logits, enc_topk_bboxes = None, None, None, None, None + if labels is not None: + enc_topk_logits = outputs.enc_topk_logits if return_dict else outputs[-5] + enc_topk_bboxes = outputs.enc_topk_bboxes if return_dict else outputs[-4] + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, + labels, + self.device, + pred_boxes, + self.config, + outputs_class, + outputs_coord, + enc_topk_logits=enc_topk_logits, + enc_topk_bboxes=enc_topk_bboxes, + denoising_meta_values=denoising_meta_values, + predicted_corners=predicted_corners, + initial_reference_points=initial_reference_points, + **kwargs, + ) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + (auxiliary_outputs,) + outputs + else: + output = (logits, pred_boxes) + outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return RTDetrObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_logits=outputs.intermediate_logits, + intermediate_reference_points=outputs.intermediate_reference_points, + intermediate_predicted_corners=outputs.intermediate_predicted_corners, + initial_reference_points=outputs.initial_reference_points, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + init_reference_points=outputs.init_reference_points, + enc_topk_logits=outputs.enc_topk_logits, + enc_topk_bboxes=outputs.enc_topk_bboxes, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + denoising_meta_values=outputs.denoising_meta_values, + ) + + +__all__ = [ + "RTDetrForObjectDetection", + "RTDetrModel", + "RTDetrPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/modeling_rt_detr_resnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/modeling_rt_detr_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..21781dc3573fd8a16f44640ea790109aed720389 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/modeling_rt_detr_resnet.py @@ -0,0 +1,399 @@ +# coding=utf-8 +# Copyright 2024 Microsoft Research, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +PyTorch RTDetr specific ResNet model. The main difference between hugginface ResNet model is that this RTDetrResNet model forces to use shortcut at the first layer in the resnet-18/34 models. +See https://github.com/lyuwenyu/RT-DETR/blob/5b628eaa0a2fc25bdafec7e6148d5296b144af85/rtdetr_pytorch/src/nn/backbone/presnet.py#L126 for details. +""" + +import math +from typing import Optional + +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...modeling_outputs import BackboneOutput, BaseModelOutputWithNoAttention +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, logging +from ...utils.backbone_utils import BackboneMixin +from .configuration_rt_detr_resnet import RTDetrResNetConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.resnet.modeling_resnet.ResNetConvLayer -> RTDetrResNetConvLayer +class RTDetrResNetConvLayer(nn.Module): + def __init__( + self, in_channels: int, out_channels: int, kernel_size: int = 3, stride: int = 1, activation: str = "relu" + ): + super().__init__() + self.convolution = nn.Conv2d( + in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, bias=False + ) + self.normalization = nn.BatchNorm2d(out_channels) + self.activation = ACT2FN[activation] if activation is not None else nn.Identity() + + def forward(self, input: Tensor) -> Tensor: + hidden_state = self.convolution(input) + hidden_state = self.normalization(hidden_state) + hidden_state = self.activation(hidden_state) + return hidden_state + + +class RTDetrResNetEmbeddings(nn.Module): + """ + ResNet Embeddings (stem) composed of a deep aggressive convolution. + """ + + def __init__(self, config: RTDetrResNetConfig): + super().__init__() + self.embedder = nn.Sequential( + *[ + RTDetrResNetConvLayer( + config.num_channels, + config.embedding_size // 2, + kernel_size=3, + stride=2, + activation=config.hidden_act, + ), + RTDetrResNetConvLayer( + config.embedding_size // 2, + config.embedding_size // 2, + kernel_size=3, + stride=1, + activation=config.hidden_act, + ), + RTDetrResNetConvLayer( + config.embedding_size // 2, + config.embedding_size, + kernel_size=3, + stride=1, + activation=config.hidden_act, + ), + ] + ) + self.pooler = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.num_channels = config.num_channels + + def forward(self, pixel_values: Tensor) -> Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + embedding = self.embedder(pixel_values) + embedding = self.pooler(embedding) + return embedding + + +# Copied from transformers.models.resnet.modeling_resnet.ResNetShortCut -> RTDetrResNetChortCut +class RTDetrResNetShortCut(nn.Module): + """ + ResNet shortcut, used to project the residual features to the correct size. If needed, it is also used to + downsample the input using `stride=2`. + """ + + def __init__(self, in_channels: int, out_channels: int, stride: int = 2): + super().__init__() + self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False) + self.normalization = nn.BatchNorm2d(out_channels) + + def forward(self, input: Tensor) -> Tensor: + hidden_state = self.convolution(input) + hidden_state = self.normalization(hidden_state) + return hidden_state + + +class RTDetrResNetBasicLayer(nn.Module): + """ + A classic ResNet's residual layer composed by two `3x3` convolutions. + See https://github.com/lyuwenyu/RT-DETR/blob/5b628eaa0a2fc25bdafec7e6148d5296b144af85/rtdetr_pytorch/src/nn/backbone/presnet.py#L34. + """ + + def __init__( + self, + config: RTDetrResNetConfig, + in_channels: int, + out_channels: int, + stride: int = 1, + should_apply_shortcut: bool = False, + ): + super().__init__() + if in_channels != out_channels: + self.shortcut = ( + nn.Sequential( + *[nn.AvgPool2d(2, 2, 0, ceil_mode=True), RTDetrResNetShortCut(in_channels, out_channels, stride=1)] + ) + if should_apply_shortcut + else nn.Identity() + ) + else: + self.shortcut = ( + RTDetrResNetShortCut(in_channels, out_channels, stride=stride) + if should_apply_shortcut + else nn.Identity() + ) + self.layer = nn.Sequential( + RTDetrResNetConvLayer(in_channels, out_channels, stride=stride), + RTDetrResNetConvLayer(out_channels, out_channels, activation=None), + ) + self.activation = ACT2FN[config.hidden_act] + + def forward(self, hidden_state): + residual = hidden_state + hidden_state = self.layer(hidden_state) + residual = self.shortcut(residual) + hidden_state += residual + hidden_state = self.activation(hidden_state) + return hidden_state + + +class RTDetrResNetBottleNeckLayer(nn.Module): + """ + A classic RTDetrResNet's bottleneck layer composed by three `3x3` convolutions. + + The first `1x1` convolution reduces the input by a factor of `reduction` in order to make the second `3x3` + convolution faster. The last `1x1` convolution remaps the reduced features to `out_channels`. If + `downsample_in_bottleneck` is true, downsample will be in the first layer instead of the second layer. + """ + + def __init__( + self, + config: RTDetrResNetConfig, + in_channels: int, + out_channels: int, + stride: int = 1, + ): + super().__init__() + reduction = 4 + should_apply_shortcut = in_channels != out_channels or stride != 1 + reduces_channels = out_channels // reduction + if stride == 2: + self.shortcut = nn.Sequential( + *[ + nn.AvgPool2d(2, 2, 0, ceil_mode=True), + RTDetrResNetShortCut(in_channels, out_channels, stride=1) + if should_apply_shortcut + else nn.Identity(), + ] + ) + else: + self.shortcut = ( + RTDetrResNetShortCut(in_channels, out_channels, stride=stride) + if should_apply_shortcut + else nn.Identity() + ) + self.layer = nn.Sequential( + RTDetrResNetConvLayer( + in_channels, reduces_channels, kernel_size=1, stride=stride if config.downsample_in_bottleneck else 1 + ), + RTDetrResNetConvLayer( + reduces_channels, reduces_channels, stride=stride if not config.downsample_in_bottleneck else 1 + ), + RTDetrResNetConvLayer(reduces_channels, out_channels, kernel_size=1, activation=None), + ) + self.activation = ACT2FN[config.hidden_act] + + def forward(self, hidden_state): + residual = hidden_state + hidden_state = self.layer(hidden_state) + residual = self.shortcut(residual) + hidden_state += residual + hidden_state = self.activation(hidden_state) + return hidden_state + + +class RTDetrResNetStage(nn.Module): + """ + A RTDetrResNet stage composed by stacked layers. + """ + + def __init__( + self, + config: RTDetrResNetConfig, + in_channels: int, + out_channels: int, + stride: int = 2, + depth: int = 2, + ): + super().__init__() + + layer = RTDetrResNetBottleNeckLayer if config.layer_type == "bottleneck" else RTDetrResNetBasicLayer + + if config.layer_type == "bottleneck": + first_layer = layer( + config, + in_channels, + out_channels, + stride=stride, + ) + else: + first_layer = layer(config, in_channels, out_channels, stride=stride, should_apply_shortcut=True) + self.layers = nn.Sequential( + first_layer, *[layer(config, out_channels, out_channels) for _ in range(depth - 1)] + ) + + def forward(self, input: Tensor) -> Tensor: + hidden_state = input + for layer in self.layers: + hidden_state = layer(hidden_state) + return hidden_state + + +# Copied from transformers.models.resnet.modeling_resnet.ResNetEncoder with ResNet->RTDetrResNet +class RTDetrResNetEncoder(nn.Module): + def __init__(self, config: RTDetrResNetConfig): + super().__init__() + self.stages = nn.ModuleList([]) + # based on `downsample_in_first_stage` the first layer of the first stage may or may not downsample the input + self.stages.append( + RTDetrResNetStage( + config, + config.embedding_size, + config.hidden_sizes[0], + stride=2 if config.downsample_in_first_stage else 1, + depth=config.depths[0], + ) + ) + in_out_channels = zip(config.hidden_sizes, config.hidden_sizes[1:]) + for (in_channels, out_channels), depth in zip(in_out_channels, config.depths[1:]): + self.stages.append(RTDetrResNetStage(config, in_channels, out_channels, depth=depth)) + + def forward( + self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True + ) -> BaseModelOutputWithNoAttention: + hidden_states = () if output_hidden_states else None + + for stage_module in self.stages: + if output_hidden_states: + hidden_states = hidden_states + (hidden_state,) + + hidden_state = stage_module(hidden_state) + + if output_hidden_states: + hidden_states = hidden_states + (hidden_state,) + + if not return_dict: + return tuple(v for v in [hidden_state, hidden_states] if v is not None) + + return BaseModelOutputWithNoAttention( + last_hidden_state=hidden_state, + hidden_states=hidden_states, + ) + + +@auto_docstring +# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->RTDetrResNet +class RTDetrResNetPreTrainedModel(PreTrainedModel): + config: RTDetrResNetConfig + base_model_prefix = "resnet" + main_input_name = "pixel_values" + _no_split_modules = ["RTDetrResNetConvLayer", "RTDetrResNetShortCut"] + + def _init_weights(self, module): + if isinstance(module, nn.Conv2d): + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`. + elif isinstance(module, nn.Linear): + nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + nn.init.uniform_(module.bias, -bound, bound) + elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(module.weight, 1) + nn.init.constant_(module.bias, 0) + + +@auto_docstring( + custom_intro=""" + ResNet backbone, to be used with frameworks like RTDETR. + """ +) +class RTDetrResNetBackbone(RTDetrResNetPreTrainedModel, BackboneMixin): + has_attentions = False + + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + + self.num_features = [config.embedding_size] + config.hidden_sizes + self.embedder = RTDetrResNetEmbeddings(config) + self.encoder = RTDetrResNetEncoder(config) + + # initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None + ) -> BackboneOutput: + r""" + Examples: + + ```python + >>> from transformers import RTDetrResNetConfig, RTDetrResNetBackbone + >>> import torch + from ...utils.deprecation import deprecate_kwarg + from ...utils.deprecation import deprecate_kwarg + from ...utils.deprecation import deprecate_kwarg + from ...utils.deprecation import deprecate_kwarg + from ...utils.deprecation import deprecate_kwarg + + >>> config = RTDetrResNetConfig() + >>> model = RTDetrResNetBackbone(config) + + >>> pixel_values = torch.randn(1, 3, 224, 224) + + >>> with torch.no_grad(): + ... outputs = model(pixel_values) + + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 2048, 7, 7] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + embedding_output = self.embedder(pixel_values) + + outputs = self.encoder(embedding_output, output_hidden_states=True, return_dict=True) + + hidden_states = outputs.hidden_states + + feature_maps = () + for idx, stage in enumerate(self.stage_names): + if stage in self.out_features: + feature_maps += (hidden_states[idx],) + + if not return_dict: + output = (feature_maps,) + if output_hidden_states: + output += (outputs.hidden_states,) + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=None, + ) + + +__all__ = [ + "RTDetrResNetBackbone", + "RTDetrResNetPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..efc3772ca6d015271102638c3948958777498fd9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_rt_detr_v2 import * + from .modeling_rt_detr_v2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4d2b9898eb232fae0f84109a6c4a6bff8f09e07 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/configuration_rt_detr_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/configuration_rt_detr_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d1a6de2b4835854cf0937b07e79bffd65d6a6dc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/configuration_rt_detr_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/modular_rt_detr_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/modular_rt_detr_v2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3bdfaa2198b83b34e5cd35ea80d2eac653d26c0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/modular_rt_detr_v2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86c8cc7df11e5a343a21a477a4e2a9ce68e550aa Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/configuration_rwkv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/configuration_rwkv.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92f7f260b446734e89ef2b8a19b67847f52af55d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/configuration_rwkv.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/modeling_rwkv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/modeling_rwkv.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8690e0c7bbf4635a652cfc02f78f2cfd6fd060f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/modeling_rwkv.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32cbe1887c9f8b5aad56f1c6a8d4c965c94ad9d5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/modeling_sam_hq.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/modeling_sam_hq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66d690a1589905f3f7081450ae779bc0f4ed8858 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/modeling_sam_hq.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/modular_sam_hq.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/modular_sam_hq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25eba43567fda06ab476bc28de6ac5d79118750b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/modular_sam_hq.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/processing_samhq.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/processing_samhq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d63f7e32f525a5bec7699cb0b7a96daa84b8bf1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/processing_samhq.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0d289de0474acfadeeee08019dd57f40f9a1a4b4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_seamless_m4t import * + from .feature_extraction_seamless_m4t import * + from .modeling_seamless_m4t import * + from .processing_seamless_m4t import * + from .tokenization_seamless_m4t import * + from .tokenization_seamless_m4t_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/configuration_seamless_m4t.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/configuration_seamless_m4t.py new file mode 100644 index 0000000000000000000000000000000000000000..fe4f911957fcb83f91113da96aeeb9280a0638fd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/configuration_seamless_m4t.py @@ -0,0 +1,416 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""SeamlessM4T model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class SeamlessM4TConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`~SeamlessM4TModel`]. It is used to instantiate an + SeamlessM4T model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the SeamlessM4T + ["facebook/hf-seamless-m4t-medium"](https://huggingface.co/"facebook/hf-seamless-m4t-medium") architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 256102): + Vocabulary size of the SeamlessM4T model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`~SeamlessM4TModel`], [`~SeamlessM4TForTextToSpeech`] or + [`~SeamlessM4TForTextToText`]. + t2u_vocab_size (`int`, *optional*, defaults to 10082): + Unit vocabulary size of the SeamlessM4T model. Defines the number of different unit tokens that can be + represented by the `inputs_ids` passed when calling the Text-To-Units sub-model of [`~SeamlessM4TModel`], + [`~SeamlessM4TForSpeechToSpeech`] or [`~SeamlessM4TForTextToSpeech`]. + + > Parameters shared across sub-models + + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the "intermediate" layers in the architecture. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + max_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model text encoder and decoder might ever be used with. Typically set + this to something large just in case (e.g., 512 or 1024 or 2048). + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether the model is used as an encoder/decoder or not. + encoder_layerdrop (`float`, *optional*, defaults to 0.05): + The LayerDrop probability for the encoders. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.05): + The LayerDrop probability for the decoders. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the decoder and feed-forward layers. If string, + `"gelu"`, `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, decoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all attention layers. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for all activation layers in the model. + scale_embedding (`bool`, *optional*, defaults to `True`): + Scale embeddings by diving by sqrt(d_model). + + > Text encoder and text decoder specific parameters + + encoder_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer text encoder. + encoder_ffn_dim (`int`, *optional*, defaults to 8192): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text encoder. + encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer text encoder. + decoder_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer text decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 8192): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text decoder. + decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer text decoder. + decoder_start_token_id (`int`, *optional*, defaults to 3): + If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token. Only + applied in the text decoder. + max_new_tokens (`int`, *optional*, defaults to 256): + The maximum numbers of text tokens to generate, ignoring the number of tokens in the prompt. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the _padding_ text token. Only applied to the text-decoder model. + bos_token_id (`int`, *optional*, defaults to 2): + The id of the _beginning-of-stream_ text token. Only applied to the text-decoder model. + eos_token_id (`int`, *optional*, defaults to 3): + The id of the _end-of-stream_ text token. Only applied to the text-decoder model. + + > Speech encoder specific parameters + + speech_encoder_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer speech encoder. + speech_encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer speech encoder. + speech_encoder_intermediate_size (`int`, *optional*, defaults to 4096): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer speech encoder. + speech_encoder_hidden_act (`str` or `function`, *optional*, defaults to `"swish"`): + The non-linear activation function (function or string) in the speech encoder. If string, `"gelu"`, + `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported. + speech_encoder_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for all layers in the speech encoder. + add_adapter (`bool`, *optional*, defaults to `True`): + Add an adapter layer on top of the speech encoder. + speech_encoder_layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability for the speech encoder. See the [LayerDrop paper](see + https://huggingface.co/papers/1909.11556) for more details. + feature_projection_input_dim (`int`, *optional*, defaults to 160): + Input dimension of the input feature projection of the speech encoder, i.e the dimension after processing + input audios with [`SeamlessM4TFeatureExtractor`]. + num_conv_pos_embeddings (`int`, *optional*, defaults to 128): + Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional + embeddings layer of the speech encoder. + num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): + Number of groups of 1D convolutional positional embeddings layer of the speech encoder. + adaptor_kernel_size (`int`, *optional*, defaults to 8): + Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adaptor_stride (`int`, *optional*, defaults to 8): + Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adaptor_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all layers in the speech adapter. + num_adapter_layers (`int`, *optional*, defaults to 1): + Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is + True`. + position_embeddings_type (`str`, *optional*, defaults to `"relative"`): + Can be specified to `relative` or `rotary` for relative or rotary position embeddings respectively. If left + `None` no relative position embedding is applied. Only applied to the speech encoder. + rotary_embedding_base (`int`, *optional*, defaults to 10000): + If `"rotary"` position embeddings are used, defines the size of the embedding base. Only applied to the + speech encoder. + max_source_positions (`int`, *optional*, defaults to 4096): + if `"relative"` position embeddings are used, defines the maximum source input positions. Only applied to + the speech encoder. + conv_depthwise_kernel_size (`int`, *optional*, defaults to 31): + Kernel size of convolutional depthwise 1D layer in Conformer blocks. Only applied to the speech encoder. + + > Text-To-Unit (t2u) model specific parameters + + t2u_bos_token_id (`int`, *optional*, defaults to 0): + The id of the _beginning-of-stream_ unit token. Only applied to the text-to-unit seq2seq model. + t2u_pad_token_id (`int`, *optional*, defaults to 1): + The id of the _padding_ unit token. Only applied to the text-to-unit seq2seq model. + t2u_eos_token_id (`int`, *optional*, defaults to 2): + The id of the _end-of-stream_ unit token. Only applied to the text-to-unit seq2seq model. + t2u_decoder_start_token_id (`int`, *optional*, defaults to 2): + If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token. Only + applied to the text-to-unit seq2seq model. + t2u_max_new_tokens (`int`, *optional*, defaults to 1024): + The maximum numbers of unit tokens to generate, ignoring the number of tokens in the prompt. Only applied + to the text-to-unit seq2seq model. + t2u_encoder_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer text-to-unit encoder. + t2u_encoder_ffn_dim (`int`, *optional*, defaults to 8192): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text-to-unit encoder. + t2u_encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer text-to-unit encoder. + t2u_decoder_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer text-to-unit decoder. + t2u_decoder_ffn_dim (`int`, *optional*, defaults to 8192): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text-to-unit decoder. + t2u_decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer text-to-unit decoder. + t2u_max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model text-to-unit component might ever be used with. Typically set + this to something large just in case (e.g., 512 or 1024 or 2048). + + > Hifi-Gan Vocoder specific parameters + + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the output audio will be generated, expressed in hertz (Hz). + upsample_initial_channel (`int`, *optional*, defaults to 512): + The number of input channels into the hifi-gan upsampling network. Applies to the vocoder only. + upsample_rates (`tuple[int]` or `list[int]`, *optional*, defaults to `[5, 4, 4, 2, 2]`): + A tuple of integers defining the stride of each 1D convolutional layer in the vocoder upsampling network. + The length of *upsample_rates* defines the number of convolutional layers and has to match the length of + *upsample_kernel_sizes*. Applies to the vocoder only. + upsample_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[11, 8, 8, 4, 4]`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the vocoder upsampling + network. The length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match + the length of *upsample_rates*. Applies to the vocoder only. + resblock_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[3, 7, 11]`): + A tuple of integers defining the kernel sizes of the vocoder 1D convolutional layers in the multi-receptive + field fusion (MRF) module. Applies to the vocoder only. + resblock_dilation_sizes (`tuple[tuple[int]]` or `list[list[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`): + A nested tuple of integers defining the dilation rates of the vocoder dilated 1D convolutional layers in + the multi-receptive field fusion (MRF) module. Applies to the vocoder only. + leaky_relu_slope (`float`, *optional*, defaults to 0.1): + The angle of the negative slope used by the leaky ReLU activation in the vocoder. Applies to the vocoder + only. + unit_hifi_gan_vocab_size (`int`, *optional*, defaults to 10000): + Vocabulary size of the SeamlessM4T vocoder. Defines the number of different unit tokens that can be + represented by the `inputs_ids` passed when calling the vocoder of [`~SeamlessM4TModel`], + [`~SeamlessM4TForSpeechToSpeech`] or [`~SeamlessM4TForTextToSpeech`]. + unit_embed_dim (`int`, *optional*, defaults to 1280): + The projection dimension of the input ids given to the hifi-gan vocoder. Applies to the vocoder only. + lang_embed_dim (`int`, *optional*, defaults to 256): + The projection dimension of the target language given to the hifi-gan vocoder. Applies to the vocoder only. + spkr_embed_dim (`int`, *optional*, defaults to 256): + The projection dimension of the speaker id given to the hifi-gan vocoder. Applies to the vocoder only. + vocoder_num_langs (`int`, *optional*, defaults to 36): + Number of langs supported by the vocoder. Might be different from `t2u_num_langs`. + vocoder_num_spkrs (`int`, *optional*, defaults to 200): + Number of speakers supported by the vocoder. + variance_predictor_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the duration predictor. Applies to the vocoder only. + var_pred_dropout (`float`, *optional*, defaults to 0.5): + The dropout probability of the duration predictor. Applies to the vocoder only. + vocoder_offset (`int`, *optional*, defaults to 4): + Offset the unit token ids by this number to account for symbol tokens. Applies to the vocoder only. + + ```python + >>> from transformers import SeamlessM4TModel, SeamlessM4TConfig + + >>> # Initializing a SeamlessM4T "facebook/hf-seamless-m4t-medium" style configuration + >>> configuration = SeamlessM4TConfig() + + >>> # Initializing a model from the "facebook/hf-seamless-m4t-medium" style configuration + >>> model = SeamlessM4TModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "seamless_m4t" + + def __init__( + self, + vocab_size=256102, + t2u_vocab_size=10082, + # shared config + hidden_size=1024, + initializer_range=0.02, + layer_norm_eps=1e-5, + use_cache=True, + max_position_embeddings=1024, + is_encoder_decoder=True, + encoder_layerdrop=0.05, + decoder_layerdrop=0.05, + activation_function="relu", + dropout=0.1, + attention_dropout=0.1, + activation_dropout=0.0, + scale_embedding=True, + # text encoder|decoder + encoder_layers=24, + encoder_ffn_dim=8192, + encoder_attention_heads=16, + decoder_layers=24, + decoder_ffn_dim=8192, + decoder_attention_heads=16, + decoder_start_token_id=3, + max_new_tokens=256, + pad_token_id=0, + bos_token_id=2, + eos_token_id=3, + # speech_encoder + speech_encoder_layers=24, + speech_encoder_attention_heads=16, + speech_encoder_intermediate_size=4096, + speech_encoder_hidden_act="swish", + speech_encoder_dropout=0.0, + add_adapter=True, + speech_encoder_layerdrop=0.1, + feature_projection_input_dim=160, + num_conv_pos_embeddings=128, + num_conv_pos_embedding_groups=16, + adaptor_kernel_size=8, + adaptor_stride=8, + adaptor_dropout=0.1, + num_adapter_layers=1, + position_embeddings_type="relative", + rotary_embedding_base=10000, + max_source_positions=4096, + conv_depthwise_kernel_size=31, + # t2u config + t2u_bos_token_id=0, + t2u_pad_token_id=1, + t2u_eos_token_id=2, + t2u_decoder_start_token_id=2, + t2u_max_new_tokens=1024, + t2u_encoder_layers=6, + t2u_encoder_ffn_dim=8192, + t2u_encoder_attention_heads=16, + t2u_decoder_layers=6, + t2u_decoder_ffn_dim=8192, + t2u_decoder_attention_heads=16, + t2u_max_position_embeddings=2048, + # hifi-gan vocoder config + sampling_rate=16000, + upsample_initial_channel=512, + upsample_rates=[5, 4, 4, 2, 2], + upsample_kernel_sizes=[11, 8, 8, 4, 4], + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + leaky_relu_slope=0.1, + # specific to Code Hifi-Gan + unit_hifi_gan_vocab_size=10000, + unit_embed_dim=1280, + lang_embed_dim=256, + spkr_embed_dim=256, + vocoder_num_langs=36, + vocoder_num_spkrs=200, + variance_predictor_kernel_size=3, + var_pred_dropout=0.5, + vocoder_offset=4, + **kwargs, + ): + # overall_config + self.vocab_size = vocab_size + self.t2u_vocab_size = t2u_vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.max_position_embeddings = max_position_embeddings + self.use_cache = use_cache + self.max_new_tokens = max_new_tokens + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.activation_function = activation_function + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.scale_embedding = scale_embedding + # for proper config init + self.num_attention_heads = decoder_attention_heads + self.num_hidden_layers = decoder_layers + + # text|unit encoder|decoder + self.encoder_layers = encoder_layers + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_attention_heads = encoder_attention_heads + self.decoder_layers = decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_attention_heads = decoder_attention_heads + + # speech_encoder + self.speech_encoder_layers = speech_encoder_layers + self.speech_encoder_hidden_act = speech_encoder_hidden_act + self.speech_encoder_dropout = speech_encoder_dropout + self.speech_encoder_attention_heads = speech_encoder_attention_heads + self.speech_encoder_layerdrop = speech_encoder_layerdrop + self.speech_encoder_intermediate_size = speech_encoder_intermediate_size + self.feature_projection_input_dim = feature_projection_input_dim + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.adaptor_kernel_size = adaptor_kernel_size + self.adaptor_stride = adaptor_stride + self.adaptor_dropout = adaptor_dropout + self.num_adapter_layers = num_adapter_layers + self.position_embeddings_type = position_embeddings_type + self.rotary_embedding_base = rotary_embedding_base + self.max_source_positions = max_source_positions + self.conv_depthwise_kernel_size = conv_depthwise_kernel_size + self.add_adapter = add_adapter + + # t2u config + self.t2u_bos_token_id = t2u_bos_token_id + self.t2u_pad_token_id = t2u_pad_token_id + self.t2u_eos_token_id = t2u_eos_token_id + self.t2u_decoder_start_token_id = t2u_decoder_start_token_id + self.t2u_max_new_tokens = t2u_max_new_tokens + self.t2u_encoder_layers = t2u_encoder_layers + self.t2u_encoder_ffn_dim = t2u_encoder_ffn_dim + self.t2u_encoder_attention_heads = t2u_encoder_attention_heads + self.t2u_decoder_layers = t2u_decoder_layers + self.t2u_decoder_ffn_dim = t2u_decoder_ffn_dim + self.t2u_decoder_attention_heads = t2u_decoder_attention_heads + self.t2u_max_position_embeddings = t2u_max_position_embeddings + + # hifi-gan vocoder config + # original parameters specific to Hifi-Gan + self.sampling_rate = sampling_rate + self.upsample_initial_channel = upsample_initial_channel + self.upsample_rates = upsample_rates + self.upsample_kernel_sizes = upsample_kernel_sizes + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.leaky_relu_slope = leaky_relu_slope + + # specific to Code Hifi-Gan + self.unit_hifi_gan_vocab_size = unit_hifi_gan_vocab_size + self.unit_embed_dim = unit_embed_dim + self.lang_embed_dim = lang_embed_dim + self.spkr_embed_dim = spkr_embed_dim + self.vocoder_num_langs = vocoder_num_langs + self.vocoder_num_spkrs = vocoder_num_spkrs + self.variance_predictor_kernel_size = variance_predictor_kernel_size + self.var_pred_dropout = var_pred_dropout + self.vocoder_offset = vocoder_offset + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + is_encoder_decoder=is_encoder_decoder, + max_position_embeddings=max_position_embeddings, + **kwargs, + ) + + +__all__ = ["SeamlessM4TConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py new file mode 100644 index 0000000000000000000000000000000000000000..59a20059f869ecca947d0999d3239573fbf10194 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py @@ -0,0 +1,309 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Feature extractor class for SeamlessM4T +""" + +from typing import Optional, Union + +import numpy as np + +from ...utils import is_torch_available + + +if is_torch_available(): + import torch + +from ...audio_utils import mel_filter_bank, spectrogram, window_function +from ...feature_extraction_sequence_utils import SequenceFeatureExtractor +from ...feature_extraction_utils import BatchFeature +from ...utils import PaddingStrategy, TensorType, logging + + +logger = logging.get_logger(__name__) + + +class SeamlessM4TFeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs a SeamlessM4T feature extractor. + + This feature extractor inherits from [`SequenceFeatureExtractor`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + This class extracts mel-filter bank features from raw speech. + + Args: + feature_size (`int`, *optional*, defaults to 80): + The feature dimension of the extracted features. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). + num_mel_bins (`int`, *optional*, defaults to 80): + Number of Mel-frequency bins. + padding_value (`float`, *optional*, defaults to 0.0): + The value that is used to fill the padding vectors. + stride (`int`, *optional*, defaults to 2): + Stride used to reshape audios from shape (batch_size,num_frames,num_mel_bins) to + (batch_size,num_frames//stride,num_mel_bins*stride). + """ + + model_input_names = ["input_features", "attention_mask"] + + def __init__( + self, + feature_size=80, + sampling_rate=16000, + num_mel_bins=80, + padding_value=0.0, + stride=2, + **kwargs, + ): + self.num_mel_bins = num_mel_bins + self.return_attention_mask = True + self.stride = stride + + mel_filters = mel_filter_bank( + num_frequency_bins=257, + num_mel_filters=self.num_mel_bins, + min_frequency=20, + max_frequency=sampling_rate // 2, + sampling_rate=sampling_rate, + norm=None, + mel_scale="kaldi", + triangularize_in_mel_space=True, + ) + + self.mel_filters = mel_filters + self.window = window_function(400, "povey", periodic=False) + + super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) + + @staticmethod + # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm + def zero_mean_unit_var_norm( + input_values: list[np.ndarray], attention_mask: list[np.ndarray], padding_value: float = 0.0 + ) -> list[np.ndarray]: + """ + Every array in the list is normalized to have zero mean and unit variance + """ + if attention_mask is not None: + attention_mask = np.array(attention_mask, np.int32) + normed_input_values = [] + + for vector, length in zip(input_values, attention_mask.sum(-1)): + normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7) + if length < normed_slice.shape[0]: + normed_slice[length:] = padding_value + + normed_input_values.append(normed_slice) + else: + normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values] + + return normed_input_values + + def _extract_fbank_features( + self, + waveform: np.ndarray, + ) -> np.ndarray: + """ + Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs + and hence the waveform should not be normalized before feature extraction. + """ + # by default, it extracts the left channel if stereo + if len(waveform.shape) == 2: + waveform = waveform[0] + + waveform = np.squeeze(waveform) * (2**15) # Kaldi compliance: 16-bit signed integers + features = spectrogram( + waveform, + self.window, + frame_length=400, + hop_length=160, + fft_length=512, + power=2.0, + center=False, + preemphasis=0.97, + mel_filters=self.mel_filters, + log_mel="log", + mel_floor=1.192092955078125e-07, + remove_dc_offset=True, + ).T + return features + + def __call__( + self, + raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]], + padding: Union[bool, str, PaddingStrategy] = True, + pad_to_multiple_of: Optional[int] = 2, + max_length: Optional[int] = None, + truncation: bool = False, + return_tensors: Optional[Union[str, TensorType]] = None, + sampling_rate: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + do_normalize_per_mel_bins: Optional[bool] = True, + **kwargs, + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). + + Args: + raw_speech (`np.ndarray`, `torch.Tensor`, `list[float]`, `list[np.ndarray]`, `list[torch.Tensor]`, + `list[list[float]]`, `list[list[list[float]]]`): + The sequence or batch of sequences to be padded. Each sequence can be a numpy array, + a torch tensor, a list of float values, a list of numpy arrays, a list of torch tensors, + a list of list of float values or a list of a list of list of float values. + If `raw_speech` is a one-dimensional `np.ndarray`, `torch.Tensor` or a `list[float]`, `raw_speech` is + considered a single-channel, single-sample sound. In all other cases, the first dimension of + `raw_speech`, whether from an `np.ndarray`, a `torch.Tensor` or a `list[...]`, + corresponds to the number of samples in the batch, and the number of channels + (i.e. mono or stereo character) is derived from the other dimensions + (1D -> single-channel waveform batches; 2D-> stereo-channel waveform batches). + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + pad_to_multiple_of (`int`, *optional*, defaults to 2): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + truncation (`bool`): + Activates truncation to cut input sequences longer than *max_length* to *max_length*. + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific feature_extractor's default. + + [What are attention masks?](../glossary#attention-mask) + + + + For SeamlessM4T models, `attention_mask` should always be passed for batched inference, to avoid subtle + bugs. + + + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors. + do_normalize_per_mel_bins (`bool`, *optional*, defaults to `True`): + Whether or not to zero-mean unit-variance normalize the input per mel-channel. + kwargs (*optional*): + Remaining dictionary of keyword arguments that will be passed to the tokenizer or the feature + extractor. + """ + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of" + f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with" + f" {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. " + "Failing to do so can result in silent errors that might be hard to debug." + ) + + return_attention_mask = ( + return_attention_mask if return_attention_mask is not None else self.return_attention_mask + ) + + is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 + if is_batched_numpy and len(raw_speech.shape) > 3: + raise ValueError(f"Only mono-channel or stereo-channel audio is supported for input to {self}") + + acceptable_types = ( + (torch.Tensor, np.ndarray, tuple, list) if is_torch_available() else (np.ndarray, tuple, list) + ) + is_batched = is_batched_numpy or ( + isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], acceptable_types)) + ) + + if is_batched: + raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech] + elif not is_batched and not isinstance(raw_speech, np.ndarray): + raw_speech = np.asarray(raw_speech, dtype=np.float32) + elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64): + raw_speech = raw_speech.astype(np.float32) + + # always return batch + if not is_batched: + raw_speech = [raw_speech] + + # extract fbank features + features = [self._extract_fbank_features(waveform) for waveform in raw_speech] + + if do_normalize_per_mel_bins: + # torch defaults to ddof=1, and numpy defaults to ddof=0 + features = [ + (x - np.expand_dims(x.mean(0), 0)) / np.sqrt(np.expand_dims(x.var(0, ddof=1), 0) + 1e-7) + for x in features + ] + + # convert into correct format for padding + encoded_inputs = BatchFeature({"input_features": features}) + + padded_inputs = self.pad( + encoded_inputs, + padding=padding, + max_length=max_length, + truncation=truncation, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=True, + return_tensors="np", + ) + + # SeamlessM4T needs to process extracted features + input_features = padded_inputs.get("input_features") + attention_mask = padded_inputs.pop("attention_mask") + + batch_size, num_frames, num_channels = input_features.shape + + remainder = num_frames % self.stride + if remainder != 0: + input_features = input_features[:, : num_frames - remainder, :] + attention_mask = attention_mask[:, : num_frames - remainder] + + input_features = np.reshape( + input_features, (batch_size, num_frames // self.stride, num_channels * self.stride) + ) + + indices = np.arange(0, num_frames - remainder) + attention_mask = attention_mask[:, indices % self.stride == 1] + + padded_inputs["input_features"] = input_features + if return_attention_mask: + padded_inputs["attention_mask"] = attention_mask + + if return_tensors is not None: + padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + + return padded_inputs + + +__all__ = ["SeamlessM4TFeatureExtractor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/modeling_seamless_m4t.py new file mode 100644 index 0000000000000000000000000000000000000000..fbe1d2fb714dae73684d3a4a5f77395ec1837144 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -0,0 +1,4070 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch SeamlessM4T model.""" + +import copy +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import Tensor, nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module +from ...modeling_attn_mask_utils import ( + _prepare_4d_attention_mask, + _prepare_4d_causal_attention_mask, +) +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Wav2Vec2BaseModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, auto_docstring, logging +from ...utils.deprecation import deprecate_kwarg +from .configuration_seamless_m4t import SeamlessM4TConfig + + +logger = logging.get_logger(__name__) + +SEAMLESS_M4T_COMMON_CUSTOM_ARGS = r""" + input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`): + Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the + [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details. + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more + information on the default strategy. + inputs_embeds (`torch.FloatTensor` of shape`(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. +""" + + +@dataclass +@auto_docstring( + custom_intro=""" + Class defining the generated outputs from [`SeamlessM4TModel`], [`SeamlessM4TForTextToText`], + [`SeamlessM4TForTextToSpeech`], [`SeamlessM4TForSpeechToSpeech`] and [`SeamlessM4TForTextToSpeech`]. + """ +) +class SeamlessM4TGenerationOutput(ModelOutput): + r""" + waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + The final audio waveform predicted by the model. + waveform_lengths (`torch.IntTensor` of shape `(batch_size,)`, *optional*): + The length in samples of each element in the `waveform` batch. + sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + The generated translated sequences. This is the output of the text-to-text or the speech-to-text models. + The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished + early due to the `eos_token_id`. + unit_sequences (`torch.LongTensor` of shape `(batch_size, unit_sequence_length)`, *optional*): + The generated translated unit sequences. This is the output of the text-to-units model. The second + dimension (unit_sequence_length) is either equal to `t2u_max_length` or shorter if all batches finished + early due to the `t2u_eos_token_id`. + """ + + waveform: Optional[torch.FloatTensor] = None + waveform_lengths: Optional[torch.IntTensor] = None + sequences: Optional[tuple[torch.FloatTensor]] = None + unit_sequences: Optional[tuple[torch.FloatTensor]] = None + + +############ UTILS ################ + + +# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +# Copied from transformers.models.bart.modeling_bart.shift_tokens_right +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor): + """ + Computes an attention mask of the form `(batch, seq_len)` with an attention for each element in the batch that + stops at the corresponding element in `seq_lens`. + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`): + The sequences to mask, where `*` is any number of sequence-specific dimensions including none. + seq_lens (`torch.Tensor` of shape `(batch)`: + Each element represents the length of the sequence at the same index in `hidden_states` + + Returns: + `torch.FloatTensor`: The float attention mask of shape `(batch, seq_len)` + """ + batch_size, mask_seq_len = hidden_states.shape[:2] + + indices = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1) + + bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len) + + mask = hidden_states.new_ones((batch_size, mask_seq_len)) + + mask = mask.masked_fill(bool_mask, 0) + + return mask + + +def format_speech_generation_kwargs(kwargs): + """ + Format kwargs for SeamlessM4T models that generate speech, attribute kwargs to either the text generation or the + speech generation models. + + Args: + kwargs (`dict`)`: + Keyword arguments are of two types: + + - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model, + except for `decoder_input_ids` which will only be passed through the text components. + - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the + text model and speech model respectively. It has the priority over the keywords without a prefix. + + This means you can, for example, specify a generation strategy for one generation but not for the + other. + """ + # attribute kwargs to models + kwargs_text = {} + kwargs_speech = {} + for key, value in kwargs.items(): + if key.startswith("text_"): + key = key[len("text_") :] + kwargs_text[key] = value + elif key.startswith("speech_"): + key = key[len("speech_") :] + kwargs_speech[key] = value + elif key == "generation_config": + kwargs_text[key] = value + else: + # If the key is already in a specific config, then it's been set with a + # submodules specific value and we don't override + if key not in kwargs_text: + kwargs_text[key] = value + if key not in kwargs_speech: + kwargs_speech[key] = value + return kwargs_text, kwargs_speech + + +############ SPEECH ENCODER related code ################ + + +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding with Wav2Vec2->SeamlessM4TConformer, feat_extract_activation->speech_encoder_hidden_act +class SeamlessM4TConformerPositionalConvEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=config.num_conv_pos_embeddings, + padding=config.num_conv_pos_embeddings // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0): + self.conv = weight_norm(self.conv, name="weight", dim=2) + if hasattr(self.conv, "parametrizations"): + weight_g = self.conv.parametrizations.weight.original0 + weight_v = self.conv.parametrizations.weight.original1 + else: + weight_g = self.conv.weight_g + weight_v = self.conv.weight_v + deepspeed.zero.register_external_parameter(self, weight_v) + deepspeed.zero.register_external_parameter(self, weight_g) + else: + self.conv = weight_norm(self.conv, name="weight", dim=2) + + self.padding = SeamlessM4TConformerSamePadLayer(config.num_conv_pos_embeddings) + self.activation = ACT2FN[config.speech_encoder_hidden_act] + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerRotaryPositionalEmbedding with Wav2Vec2->SeamlessM4T, num_attention_heads->speech_encoder_attention_heads +class SeamlessM4TConformerRotaryPositionalEmbedding(nn.Module): + """Rotary positional embedding + Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://huggingface.co/papers/2104.09864 + """ + + def __init__(self, config): + super().__init__() + dim = config.hidden_size // config.speech_encoder_attention_heads + base = config.rotary_embedding_base + + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)) + self.register_buffer("inv_freq", inv_freq) + self.cached_sequence_length = None + self.cached_rotary_positional_embedding = None + + def forward(self, hidden_states): + sequence_length = hidden_states.shape[1] + + if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None: + return self.cached_rotary_positional_embedding + + self.cached_sequence_length = sequence_length + # Embeddings are computed in the dtype of the inv_freq constant + time_stamps = torch.arange(sequence_length).type_as(self.inv_freq) + freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq) + embeddings = torch.cat((freqs, freqs), dim=-1) + + cos_embeddings = embeddings.cos()[:, None, None, :] + sin_embeddings = embeddings.sin()[:, None, None, :] + # Computed embeddings are cast to the dtype of the hidden state inputs + self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings]).type_as(hidden_states) + return self.cached_rotary_positional_embedding + + +# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerRelPositionalEmbedding with Wav2Vec2->SeamlessM4T +class SeamlessM4TConformerRelPositionalEmbedding(nn.Module): + """Relative positional encoding module.""" + + def __init__(self, config): + super().__init__() + self.max_len = config.max_source_positions + self.d_model = config.hidden_size + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, self.max_len)) + + def extend_pe(self, x): + # Reset the positional encodings + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(1) >= x.size(1) * 2 - 1: + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` is the position of query vector and `j` is the + # position of key vector. We use positive relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (iSeamlessM4T +class SeamlessM4TConformerSamePadLayer(nn.Module): + def __init__(self, num_conv_pos_embeddings): + super().__init__() + self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class SeamlessM4TConformerFeatureProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.feature_projection_input_dim, eps=config.layer_norm_eps) + self.projection = nn.Linear(config.feature_projection_input_dim, config.hidden_size) + self.dropout = nn.Dropout(config.speech_encoder_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class SeamlessM4TConformerFeedForward(nn.Module): + def __init__(self, config, act_fn=None, dropout=None): + super().__init__() + dropout = dropout if dropout is not None else config.speech_encoder_dropout + act_fn = act_fn if act_fn is not None else config.speech_encoder_hidden_act + + self.intermediate_dropout = nn.Dropout(dropout) + self.intermediate_dense = nn.Linear(config.hidden_size, config.speech_encoder_intermediate_size) + self.intermediate_act_fn = ACT2FN[act_fn] if isinstance(act_fn, str) else act_fn + + self.output_dense = nn.Linear(config.speech_encoder_intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class SeamlessM4TConformerConvolutionModule(nn.Module): + """Convolution block used in the conformer block""" + + def __init__(self, config): + super().__init__() + if (config.conv_depthwise_kernel_size - 1) % 2 == 1: + raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding") + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.pointwise_conv1 = nn.Conv1d( + config.hidden_size, + 2 * config.hidden_size, + kernel_size=1, + stride=1, + padding=0, + bias=False, + ) + self.glu = nn.GLU(dim=1) + self.depthwise_conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + config.conv_depthwise_kernel_size, + stride=1, + padding="same", + groups=config.hidden_size, + bias=False, + ) + self.batch_norm = nn.BatchNorm1d(config.hidden_size) + self.activation = ACT2FN[config.speech_encoder_hidden_act] + self.pointwise_conv2 = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=1, + stride=1, + padding=0, + bias=False, + ) + self.dropout = nn.Dropout(config.speech_encoder_dropout) + + def forward(self, hidden_states, attention_mask=None): + hidden_states = self.layer_norm(hidden_states) + + # Ensure that we do not leak padded positions in depthwise convolution. + # Put 0 where necessary + if attention_mask is not None: + hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0) + + # exchange the temporal dimension and the feature dimension + hidden_states = hidden_states.transpose(1, 2) + + # GLU mechanism + # => (batch, 2*channel, dim) + hidden_states = self.pointwise_conv1(hidden_states) + # => (batch, channel, dim) + hidden_states = self.glu(hidden_states) + + # 1D Depthwise Conv + hidden_states = self.depthwise_conv(hidden_states) + hidden_states = self.batch_norm(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = self.pointwise_conv2(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class SeamlessM4TConformerSelfAttention(nn.Module): + """Construct a SeamlessM4TConformerSelfAttention object. + Can be enhanced with rotary or relative position embeddings. + """ + + def __init__(self, config, use_position_embeddings=True): + super().__init__() + + self.head_size = config.hidden_size // config.speech_encoder_attention_heads + self.num_heads = config.speech_encoder_attention_heads + self.position_embeddings_type = config.position_embeddings_type if use_position_embeddings else None + + self.linear_q = nn.Linear(config.hidden_size, config.hidden_size) + self.linear_k = nn.Linear(config.hidden_size, config.hidden_size) + self.linear_v = nn.Linear(config.hidden_size, config.hidden_size) + self.linear_out = nn.Linear(config.hidden_size, config.hidden_size) + + self.dropout = nn.Dropout(p=config.speech_encoder_dropout) + + if self.position_embeddings_type == "relative": + # linear transformation for positional encoding + self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://huggingface.co/papers/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size)) + self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size)) + + # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerSelfAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + relative_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + # self-attention mechanism + batch_size, sequence_length, hidden_size = hidden_states.size() + + # make sure query/key states can be != value states + query_key_states = hidden_states + value_states = hidden_states + + if self.position_embeddings_type == "rotary": + if relative_position_embeddings is None: + raise ValueError( + "`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'" + ) + query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings) + + # project query_key_states and value_states + query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size) + key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size) + value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size) + + # => (batch, head, time1, d_k) + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + + if self.position_embeddings_type == "relative": + if relative_position_embeddings is None: + raise ValueError( + "`relative_position_embeddings` has to be defined when `self.position_embeddings_type ==" + " 'relative'" + ) + # apply relative_position_embeddings to qk scores + # as proposed in Transformer_XL: https://huggingface.co/papers/1901.02860 + scores = self._apply_relative_embeddings( + query=query, key=key, relative_position_embeddings=relative_position_embeddings + ) + else: + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size) + + # apply attention_mask if necessary + if attention_mask is not None: + scores = scores + attention_mask + + # => (batch, head, time1, time2) + probs = torch.softmax(scores, dim=-1) + probs = self.dropout(probs) + + # => (batch, head, time1, d_k) + hidden_states = torch.matmul(probs, value) + + # => (batch, time1, hidden_size) + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size) + hidden_states = self.linear_out(hidden_states) + + return hidden_states, probs + + # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerSelfAttention._apply_rotary_embedding + def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings): + batch_size, sequence_length, hidden_size = hidden_states.size() + hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size) + + cos = relative_position_embeddings[0, :sequence_length, ...] + sin = relative_position_embeddings[1, :sequence_length, ...] + + # rotate hidden_states with rotary embeddings + hidden_states = hidden_states.transpose(0, 1) + rotated_states_begin = hidden_states[..., : self.head_size // 2] + rotated_states_end = hidden_states[..., self.head_size // 2 :] + rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1) + hidden_states = (hidden_states * cos) + (rotated_states * sin) + hidden_states = hidden_states.transpose(0, 1) + + hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size) + + return hidden_states + + # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerSelfAttention._apply_relative_embeddings + def _apply_relative_embeddings(self, query, key, relative_position_embeddings): + # 1. project positional embeddings + # => (batch, head, 2*time1-1, d_k) + proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings) + proj_relative_position_embeddings = proj_relative_position_embeddings.view( + relative_position_embeddings.size(0), -1, self.num_heads, self.head_size + ) + proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2) + proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3) + + # 2. Add bias to query + # => (batch, head, time1, d_k) + query = query.transpose(1, 2) + q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2) + q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2) + + # 3. attention score: first compute matrix a and matrix c + # as described in https://huggingface.co/papers/1901.02860 Section 3.3 + # => (batch, head, time1, time2) + scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1)) + + # 4. then compute matrix b and matrix d + # => (batch, head, time1, 2*time1-1) + scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings) + + # 5. shift matrix b and matrix d + zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype) + scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1) + scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2]) + scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape) + scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd) + scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1] + + # 6. sum matrices + # => (batch, head, time1, time2) + scores = (scores_ac + scores_bd) / math.sqrt(self.head_size) + + return scores + + +class SeamlessM4TConformerEncoderLayer(GradientCheckpointingLayer): + """Conformer block based on https://huggingface.co/papers/2005.08100.""" + + # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerEncoderLayer.__init__ with Wav2Vec2->SeamlessM4T, attention_dropout->speech_encoder_dropout, torch.nn->nn + def __init__(self, config): + super().__init__() + embed_dim = config.hidden_size + dropout = config.speech_encoder_dropout + + # Feed-forward 1 + self.ffn1_layer_norm = nn.LayerNorm(embed_dim) + self.ffn1 = SeamlessM4TConformerFeedForward(config) + + # Self-Attention + self.self_attn_layer_norm = nn.LayerNorm(embed_dim) + self.self_attn_dropout = nn.Dropout(dropout) + self.self_attn = SeamlessM4TConformerSelfAttention(config) + + # Conformer Convolution + self.conv_module = SeamlessM4TConformerConvolutionModule(config) + + # Feed-forward 2 + self.ffn2_layer_norm = nn.LayerNorm(embed_dim) + self.ffn2 = SeamlessM4TConformerFeedForward(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + def forward( + self, + hidden_states, + attention_mask: Optional[torch.Tensor] = None, + relative_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + conv_attention_mask: Optional[torch.Tensor] = None, + ): + # 1. Feed-Forward 1 layer + residual = hidden_states + hidden_states = self.ffn1_layer_norm(hidden_states) + hidden_states = self.ffn1(hidden_states) + hidden_states = hidden_states * 0.5 + residual + residual = hidden_states + + # 2. Self-Attention layer + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weigts = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + relative_position_embeddings=relative_position_embeddings, + output_attentions=output_attentions, + ) + hidden_states = self.self_attn_dropout(hidden_states) + hidden_states = hidden_states + residual + + # 3. Convolutional Layer + residual = hidden_states + hidden_states = self.conv_module(hidden_states, attention_mask=conv_attention_mask) + hidden_states = residual + hidden_states + + # 4. Feed-Forward 2 Layer + residual = hidden_states + hidden_states = self.ffn2_layer_norm(hidden_states) + hidden_states = self.ffn2(hidden_states) + hidden_states = hidden_states * 0.5 + residual + hidden_states = self.final_layer_norm(hidden_states) + + return hidden_states, attn_weigts + + +class SeamlessM4TConformerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + if config.position_embeddings_type == "relative": + self.embed_positions = SeamlessM4TConformerRelPositionalEmbedding(config) + elif config.position_embeddings_type == "rotary": + self.embed_positions = SeamlessM4TConformerRotaryPositionalEmbedding(config) + else: + self.embed_positions = None + + self.dropout = nn.Dropout(config.speech_encoder_dropout) + self.layers = nn.ModuleList( + [SeamlessM4TConformerEncoderLayer(config) for _ in range(config.speech_encoder_layers)] + ) + + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + conv_attention_mask = attention_mask + if attention_mask is not None: + # make sure padded tokens output 0 + hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0) + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + hidden_states = self.dropout(hidden_states) + + if self.embed_positions is not None: + relative_position_embeddings = self.embed_positions(hidden_states) + else: + relative_position_embeddings = None + + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = self.training and dropout_probability < self.config.speech_encoder_layerdrop + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync + layer_outputs = layer( + hidden_states, + attention_mask=attention_mask, + relative_position_embeddings=relative_position_embeddings, + output_attentions=output_attentions, + conv_attention_mask=conv_attention_mask, + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class SeamlessM4TConformerAdapterLayer(nn.Module): + def __init__(self, config): + super().__init__() + embed_dim = config.hidden_size + dropout = config.adaptor_dropout + + self.kernel_size = config.adaptor_kernel_size + self.stride = config.adaptor_stride + + # 1. residual convolution + self.residual_layer_norm = nn.LayerNorm(embed_dim) + self.residual_conv = nn.Conv1d( + embed_dim, + 2 * embed_dim, + self.kernel_size, + stride=self.stride, + padding=self.stride // 2, + ) + self.activation = nn.GLU(dim=1) + + # Self-Attention + self.self_attn_layer_norm = nn.LayerNorm(embed_dim) + self.self_attn_conv = nn.Conv1d( + embed_dim, + 2 * embed_dim, + self.kernel_size, + stride=self.stride, + padding=self.stride // 2, + ) + self.self_attn = SeamlessM4TConformerSelfAttention(config, use_position_embeddings=False) + self.self_attn_dropout = nn.Dropout(dropout) + + # Feed-forward + self.ffn_layer_norm = nn.LayerNorm(embed_dim) + self.ffn = SeamlessM4TConformerFeedForward(config, act_fn="relu", dropout=dropout) + + def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask): + pad = self.kernel_size // 2 + seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1) + + seq_lens = ((seq_lens + 2 * pad - self.kernel_size) / self.stride) + 1 + + return seq_lens.floor() + + def forward( + self, + hidden_states, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + residual = self.residual_layer_norm(hidden_states) + + # Apply pooling to the residual to match the sequence length of the + # multi-head attention output. + # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len) + residual = residual.transpose(1, 2) + residual = self.residual_conv(residual) + residual = self.activation(residual) + # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim) + residual = residual.transpose(1, 2) + + hidden_states = self.self_attn_layer_norm(hidden_states) + # Apply pooling before feeding to the multihead-attention layer. + # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.self_attn_conv(hidden_states) + hidden_states = self.activation(hidden_states) + # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim) + hidden_states = hidden_states.transpose(1, 2) + + if attention_mask is not None: + sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to( + hidden_states.device + ) + attention_mask = _compute_new_attention_mask(hidden_states=hidden_states, seq_lens=sub_sampled_lengths) + attention_mask = _prepare_4d_attention_mask( + attention_mask, + hidden_states.dtype, + ) + + # The rest of the computation is identical to a vanilla Transformer + # encoder layer. + hidden_states, attn_weights = self.self_attn( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = self.self_attn_dropout(hidden_states) + hidden_states = hidden_states + residual + + residual = hidden_states + + hidden_states = self.ffn_layer_norm(hidden_states) + hidden_states = self.ffn(hidden_states) + residual + + return hidden_states + + +class SeamlessM4TConformerAdapter(nn.Module): + def __init__(self, config): + super().__init__() + + self.layers = nn.ModuleList(SeamlessM4TConformerAdapterLayer(config) for _ in range(config.num_adapter_layers)) + + def forward(self, hidden_states, attention_mask): + # down project hidden_states if necessary + + for layer in self.layers: + hidden_states = layer(hidden_states, attention_mask) + + return hidden_states + + +############ TEXT / UNITS related code ################ + + +# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100ScaledWordEmbedding with M2M100->SeamlessM4T +class SeamlessM4TScaledWordEmbedding(nn.Embedding): + """ + This module overrides nn.Embeddings' forward by multiplying with embeddings scale. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0): + super().__init__(num_embeddings, embedding_dim, padding_idx) + self.embed_scale = embed_scale + + def forward(self, input_ids: torch.Tensor): + return super().forward(input_ids) * self.embed_scale + + +# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding +class SeamlessM4TSinusoidalPositionalEmbedding(nn.Module): + """This module produces sinusoidal positional embeddings of any length.""" + + def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None): + super().__init__() + self.offset = 2 + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.make_weights(num_positions + self.offset, embedding_dim, padding_idx) + + def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): + emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx) + if hasattr(self, "weights"): + # in forward put the weights on the correct dtype and device of the param + emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device) + + self.register_buffer("weights", emb_weights, persistent=False) + + @staticmethod + def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): + """ + Build sinusoidal embeddings. + + This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of + "Attention Is All You Need". + """ + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb) + emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0) + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1) + if embedding_dim % 2 == 1: + # zero pad + emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) + if padding_idx is not None: + emb[padding_idx, :] = 0 + + return emb.to(torch.get_default_dtype()) + + @torch.no_grad() + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values_length: int = 0, + ): + if input_ids is not None: + bsz, seq_len = input_ids.size() + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to( + input_ids.device + ) + else: + bsz, seq_len = inputs_embeds.size()[:-1] + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length) + + # expand embeddings if needed + max_pos = self.padding_idx + 1 + seq_len + past_key_values_length + if max_pos > self.weights.size(0): + self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx) + + return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach() + + def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length + + +class SeamlessM4TAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + # Copied from transformers.models.bart.modeling_bart.BartAttention.__init__ with Bart->SeamlessM4T + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[SeamlessM4TConfig] = None, + layer_idx: Optional[int] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if encoder_hidden_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = encoder_hidden_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + + is_updated = False + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.layers[self.layer_idx].keys + value_states = curr_past_key_value.layers[self.layer_idx].values + else: + key_states = self.k_proj(current_states) + value_states = self.v_proj(current_states) + key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + + if past_key_values is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = query_states.view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2) + query_states = query_states.reshape(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.nllb_moe.modeling_nllb_moe.NllbMoeDenseActDense with NllbMoe->SeamlessM4T,DenseActDense->FeedForwardNetwork, d_model->hidden_size +class SeamlessM4TFeedForwardNetwork(nn.Module): + def __init__(self, config: SeamlessM4TConfig, ffn_dim: int): + super().__init__() + self.fc1 = nn.Linear(config.hidden_size, ffn_dim) + self.fc2 = nn.Linear(ffn_dim, config.hidden_size) + self.dropout = nn.Dropout(config.activation_dropout) + self.act = ACT2FN[config.activation_function] + + def forward(self, hidden_states): + hidden_states = self.fc1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dropout(hidden_states) + if ( + isinstance(self.fc2.weight, torch.Tensor) + and hidden_states.dtype != self.fc2.weight.dtype + and (self.fc2.weight.dtype != torch.int8 and self.fc2.weight.dtype != torch.uint8) + ): + hidden_states = hidden_states.to(self.fc2.weight.dtype) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class SeamlessM4TEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: SeamlessM4TConfig, encoder_ffn_dim=None, encoder_attention_heads=None): + super().__init__() + encoder_ffn_dim = config.encoder_ffn_dim if encoder_ffn_dim is None else encoder_ffn_dim + encoder_attention_heads = ( + config.encoder_attention_heads if encoder_attention_heads is None else encoder_attention_heads + ) + + self.embed_dim = config.hidden_size + self.self_attn = SeamlessM4TAttention( + embed_dim=self.embed_dim, + num_heads=encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.attn_dropout = nn.Dropout(config.dropout) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + self.ffn = SeamlessM4TFeedForwardNetwork(config, ffn_dim=encoder_ffn_dim) + + self.ffn_layer_norm = nn.LayerNorm(config.hidden_size) + self.ffn_dropout = nn.Dropout(config.activation_dropout) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: bool = False, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): + attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very + large negative values. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = self.attn_dropout(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + + hidden_states = self.ffn_layer_norm(hidden_states) + + hidden_states = self.ffn(hidden_states) + hidden_states = self.ffn_dropout(hidden_states) + + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class SeamlessM4TDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: SeamlessM4TConfig, decoder_ffn_dim=None, decoder_attention_heads=None, layer_idx=None): + super().__init__() + decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim + decoder_attention_heads = ( + config.decoder_attention_heads if decoder_attention_heads is None else decoder_attention_heads + ) + + self.embed_dim = config.hidden_size + self.self_attn = SeamlessM4TAttention( + embed_dim=self.embed_dim, + num_heads=decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + layer_idx=layer_idx, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.attn_dropout = nn.Dropout(config.dropout) + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.cross_attention = SeamlessM4TAttention( + self.embed_dim, + decoder_attention_heads, + config.attention_dropout, + is_decoder=True, + layer_idx=layer_idx, + ) + self.cross_attention_layer_norm = nn.LayerNorm(self.embed_dim) + + self.ffn = SeamlessM4TFeedForwardNetwork(config, ffn_dim=decoder_ffn_dim) + + self.ffn_layer_norm = nn.LayerNorm(config.hidden_size) + self.ffn_dropout = nn.Dropout(config.activation_dropout) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + cache_position: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): + attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very + large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): + encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by + very large negative values. + past_key_values (`Cache`): + cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + past_key_values=past_key_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = self.attn_dropout(hidden_states) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.cross_attention_layer_norm(hidden_states) + + hidden_states, cross_attn_weights = self.cross_attention( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = self.attn_dropout(hidden_states) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + + hidden_states = self.ffn_layer_norm(hidden_states) + + hidden_states = self.ffn(hidden_states) + hidden_states = self.ffn_dropout(hidden_states) + + hidden_states = residual + hidden_states + + return hidden_states, self_attn_weights, cross_attn_weights + + +############ SUB-MODELS related code ################ + + +@auto_docstring +class SeamlessM4TPreTrainedModel(PreTrainedModel): + config: SeamlessM4TConfig + base_model_prefix = "seamless_m4t" + supports_gradient_checkpointing = True + _no_split_modules = ["SeamlessM4TEncoderLayer", "SeamlessM4TDecoderLayer", "SeamlessM4TConformerEncoderLayer"] + + def _init_weights(self, module: nn.Module): + """Initialize the weights""" + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, SeamlessM4TConformerSelfAttention): + if hasattr(module, "pos_bias_u"): + nn.init.xavier_uniform_(module.pos_bias_u) + if hasattr(module, "pos_bias_v"): + nn.init.xavier_uniform_(module.pos_bias_v) + elif isinstance(module, SeamlessM4TConformerPositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, SeamlessM4TConformerFeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + + def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask): + kernel_size, stride = self.config.adaptor_kernel_size, self.config.adaptor_stride + pad = kernel_size // 2 + seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1) + + seq_lens = ((seq_lens + 2 * pad - kernel_size) / stride) + 1 + + return seq_lens.floor() + + def compute_last_hidden_states_per_sample( + self, + hidden_states: tuple[tuple[torch.Tensor]], + beam_indices: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Computes the last hidden states. + + Parameters: + hidden_states (`tuple[tuple[torch.Tensor]]`): + The generated hidden states. Tuple (one element for each generated token) of tuples (one element for + each layer of the decoder) of torch.FloatTensor of shape (batch_size*num_beams*num_return_sequences, + generated_length, hidden_size). + beam_indices (`torch.LongTensor`, *optional*): + Beam indices of generated token id at each generation step. `torch.LongTensor` of shape + `(batch_size*num_return_sequences, sequence_length)`. Only required if a `num_beams>1` at + generate-time. + + Return: + `torch.Tensor`: A `torch.Tensor` of shape `(batch_size*num_return_sequences, sequence_length, hidden_size)` + containing + the last hidden states. + ```""" + # 1. First, let's compute last_hidden_states from hidden_states. + # For each generation step, takes the hidden state from the last layer. + # shape: (batch_size*vocab_size*num_return_sequences, # generation_steps, hidden_dim) + last_hidden_states = torch.concat([hidden_states[-1] for hidden_states in hidden_states], dim=1) + + # 2. In absence of `beam_indices`, we can assume that we come from e.g. greedy search, which is equivalent + # to a beam search approach were the first (and only) beam is always selected + # in that case, return directly last_hidden_states + if beam_indices is None: + return last_hidden_states + + # 3. cut beam_indices to longest beam length + beam_indices_mask = beam_indices < 0 + max_beam_length = (1 - beam_indices_mask.long()).sum(-1).max() + beam_indices = beam_indices.clone()[:, :max_beam_length] + beam_indices_mask = beam_indices_mask[:, :max_beam_length] + + # 4. Set indices of beams that finished early to 0; such indices will be masked correctly afterwards anyways + beam_indices[beam_indices_mask] = 0 + + # 5. expand beam_indices to last_hidden_states dim + beam_indices = beam_indices.unsqueeze(-1) + beam_indices = beam_indices.expand(-1, -1, last_hidden_states.shape[-1]) + + # 6. select the right candidate for each beam + # in other words, new_last_hidden_states[i,j,k] = last_hidden_states[beam_indices[i,j,k], j, k] for all i, j, k + last_hidden_states = torch.gather(last_hidden_states, 0, beam_indices) + + return last_hidden_states + + +@auto_docstring( + custom_intro=""" + Transformer speech encoder consisting of *config.speech_encoder_layers* conformer self attention layers. + Each layer is a [`SeamlessM4TConformerEncoderLayer`]. + """ +) +class SeamlessM4TSpeechEncoder(SeamlessM4TPreTrainedModel): + main_input_name = "input_features" + + def __init__(self, config: SeamlessM4TConfig): + super().__init__(config) + + self.feature_projection = SeamlessM4TConformerFeatureProjection(config) + self.encoder = SeamlessM4TConformerEncoder(config) + self.intermediate_ffn = SeamlessM4TConformerFeedForward(config, act_fn="relu", dropout=0.0) + self.adapter = SeamlessM4TConformerAdapter(config) if config.add_adapter else None + self.inner_layer_norm = nn.LayerNorm(config.hidden_size) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_features: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple, Wav2Vec2BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_features is None: + raise ValueError( + """Both `input_features` and `inputs_embeds` are `None` in `SeamlessM4TSpeechEncoder.forward`. + Make sure one of them is not `None`.""" + ) + + hidden_states = self.feature_projection(input_features) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + expanded_hidden_states = self.intermediate_ffn(hidden_states) + hidden_states = hidden_states + 0.5 * expanded_hidden_states + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states, attention_mask=attention_mask) + + hidden_states = self.inner_layer_norm(hidden_states) + + if not return_dict: + return (hidden_states,) + encoder_outputs[1:] + + return Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +# inspired from MBart and NllbMoe +@auto_docstring( + custom_intro=""" + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`SeamlessM4TEncoderLayer`]. + """ +) +class SeamlessM4TEncoder(SeamlessM4TPreTrainedModel): + def __init__( + self, + config: SeamlessM4TConfig, + embed_tokens: Optional[nn.Embedding] = None, + is_t2u_encoder: bool = False, + ): + r""" + embed_tokens (`nn.Embedding`, *optional*): + Input embedding + is_t2u_encoder (`bool`, *optional*, defaults to `False`): + indicates if it belongs to the text-to-units model, in which case it won't have input embeddings + """ + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + self.padding_idx = config.pad_token_id + embed_dim = config.hidden_size + + self.is_t2u_encoder = is_t2u_encoder + self.max_source_positions = config.max_position_embeddings + + if not self.is_t2u_encoder: + embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + self.embed_tokens = SeamlessM4TScaledWordEmbedding( + config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = SeamlessM4TSinusoidalPositionalEmbedding( + self.max_source_positions, + embed_dim, + self.padding_idx, + ) + + layers = [] + for _ in range(config.encoder_layers): + layers.append( + SeamlessM4TEncoderLayer( + config, + encoder_attention_heads=config.encoder_attention_heads, + encoder_ffn_dim=config.encoder_ffn_dim, + ) + ) + + self.layers = nn.ModuleList(layers) + + self.layer_norm = nn.LayerNorm(config.hidden_size) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple, BaseModelOutput]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and self.is_t2u_encoder: + raise ValueError( + "You cannot pass input_ids to the encoder of the text_to_units model. Pass inputs_embeds instead." + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_shape = input.shape + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if not self.is_t2u_encoder: + embed_pos = self.embed_positions(input) + + hidden_states = inputs_embeds + embed_pos.to(inputs_embeds.device) + else: + hidden_states = inputs_embeds + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +@auto_docstring( + custom_intro=""" + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SeamlessM4TDecoderLayer`]. + """ +) +class SeamlessM4TDecoder(SeamlessM4TPreTrainedModel): + def __init__( + self, + config: SeamlessM4TConfig, + embed_tokens: Optional[nn.Embedding] = None, + ): + r""" + embed_tokens (`nn.Embedding`, *optional*): + Input embedding + """ + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.max_target_positions = config.max_position_embeddings + embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + # if embed_tokens defined, use its shape instead + self.embed_tokens = SeamlessM4TScaledWordEmbedding( + embed_tokens.num_embeddings, embed_tokens.embedding_dim, self.padding_idx, embed_scale=embed_scale + ) + self.embed_tokens.weight = embed_tokens.weight + else: + self.embed_tokens = SeamlessM4TScaledWordEmbedding( + self.vocab_size, config.hidden_size, self.padding_idx, embed_scale=embed_scale + ) + + self.embed_positions = SeamlessM4TSinusoidalPositionalEmbedding( + self.max_target_positions, + config.hidden_size, + padding_idx=self.padding_idx, + ) + + layers = [] + for i in range(config.decoder_layers): + layers.append( + SeamlessM4TDecoderLayer( + config, + decoder_attention_heads=config.decoder_attention_heads, + decoder_ffn_dim=config.decoder_ffn_dim, + layer_idx=i, + ) + ) + self.layers = nn.ModuleList(layers) + self.layer_norm = nn.LayerNorm(config.hidden_size) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_shape = input.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..." + ) + use_cache = False + + # initialize `past_key_values` + if use_cache and past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + if use_cache and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # embed positions + positions = self.embed_positions(input, past_key_values_length=past_key_values_length) + + hidden_states = inputs_embeds + positions.to(inputs_embeds.device) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + layer_outputs = decoder_layer( + hidden_states, + attention_mask, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@auto_docstring( + custom_intro=""" + Transformer bare text-to-unit encoder-decoder. The encoder is a [`SeamlessM4TEncoder`] without embeddings and the decoder is a [`SeamlessM4TDecoder`]. + """ +) +class SeamlessM4TTextToUnitModel(SeamlessM4TPreTrainedModel): + def __init__( + self, + config: SeamlessM4TConfig, + embed_tokens_decoder: Optional[nn.Embedding] = None, + ): + r""" + embed_tokens_decoder (`nn.Embedding`, *optional*): + input embedding of the decoder. + """ + super().__init__(config) + + self.encoder = SeamlessM4TEncoder(config, is_t2u_encoder=True) + self.decoder = SeamlessM4TDecoder(config, embed_tokens_decoder) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Transformer text-to-unit encoder-decoder with a language model head. The base encoder-decoder model is a [`SeamlessM4TTextToUnit`]. + """ +) +class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel, GenerationMixin): + _keys_to_ignore_on_load_missing = [ + "vocoder", + "speech_encoder", + "text_encoder", + "text_decoder", + ] + _tied_weights_keys = ["lm_head.weight"] + + def __init__( + self, + config: SeamlessM4TConfig, + embed_tokens_decoder: Optional[nn.Embedding] = None, + ): + r""" + embed_tokens_decoder (`nn.Embedding`, *optional*): + input embedding of the decoder. + """ + # update config - used principality for bos_token_id etc. + config = copy.deepcopy(config) + for param, val in config.to_dict().items(): + if param.startswith("t2u_"): + config.__setattr__(param[4:], val) + super().__init__(config) + + self.model = SeamlessM4TTextToUnitModel(config, embed_tokens_decoder) + + self.lm_head = nn.Linear(config.hidden_size, config.t2u_vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.model.encoder + + def get_decoder(self): + return self.model.decoder + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + @auto_docstring(custom_args=SEAMLESS_M4T_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.t2u_pad_token_id, self.config.t2u_decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + lm_logits = self.lm_head(outputs[0]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + labels = labels.to(lm_logits.device) + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.t2u_pad_token_id, self.config.t2u_decoder_start_token_id) + + def _tie_weights(self) -> None: + if getattr(self.config, "tie_word_embeddings", True): + output_embeddings = self.get_output_embeddings() + if output_embeddings is not None: + self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) + + +############ VOCODER related code ################ + + +# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock +class HifiGanResidualBlock(nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1): + super().__init__() + self.leaky_relu_slope = leaky_relu_slope + + self.convs1 = nn.ModuleList( + [ + nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + dilation=dilation[i], + padding=self.get_padding(kernel_size, dilation[i]), + ) + for i in range(len(dilation)) + ] + ) + self.convs2 = nn.ModuleList( + [ + nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + dilation=1, + padding=self.get_padding(kernel_size, 1), + ) + for _ in range(len(dilation)) + ] + ) + + def get_padding(self, kernel_size, dilation=1): + return (kernel_size * dilation - dilation) // 2 + + def apply_weight_norm(self): + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + for layer in self.convs1: + weight_norm(layer) + for layer in self.convs2: + weight_norm(layer) + + def remove_weight_norm(self): + for layer in self.convs1: + nn.utils.remove_weight_norm(layer) + for layer in self.convs2: + nn.utils.remove_weight_norm(layer) + + def forward(self, hidden_states): + for conv1, conv2 in zip(self.convs1, self.convs2): + residual = hidden_states + hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) + hidden_states = conv1(hidden_states) + hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) + hidden_states = conv2(hidden_states) + hidden_states = hidden_states + residual + return hidden_states + + +class SeamlessM4TVariancePredictor(nn.Module): + def __init__(self, config): + super().__init__() + + embed_dim = config.unit_embed_dim + kernel_size = config.variance_predictor_kernel_size + var_pred_dropout = config.var_pred_dropout + + self.conv1 = nn.Conv1d( + embed_dim, + embed_dim, + kernel_size=kernel_size, + padding=(kernel_size - 1) // 2, + ) + self.activation_function = nn.ReLU() + self.ln1 = nn.LayerNorm(embed_dim) + self.dropout_module = nn.Dropout(p=var_pred_dropout) + self.conv2 = nn.Conv1d( + embed_dim, + embed_dim, + kernel_size=kernel_size, + padding=1, + ) + self.ln2 = nn.LayerNorm(embed_dim) + self.proj = nn.Linear(embed_dim, 1) + + def forward(self, hidden_states: Tensor) -> Tensor: + # Input: B x T x C; Output: B x T + hidden_states = self.conv1(hidden_states.transpose(1, 2)) + hidden_states = self.activation_function(hidden_states).transpose(1, 2) + hidden_states = self.dropout_module(self.ln1(hidden_states)) + hidden_states = self.conv2(hidden_states.transpose(1, 2)) + hidden_states = self.activation_function(hidden_states).transpose(1, 2) + hidden_states = self.dropout_module(self.ln2(hidden_states)) + return self.proj(hidden_states).squeeze(dim=2) + + +class SeamlessM4THifiGan(nn.Module): + def __init__(self, config: SeamlessM4TConfig): + super().__init__() + model_in_dim = config.unit_embed_dim + config.lang_embed_dim + config.spkr_embed_dim + self.leaky_relu_slope = config.leaky_relu_slope + self.num_kernels = len(config.resblock_kernel_sizes) + self.num_upsamples = len(config.upsample_rates) + self.conv_pre = nn.Conv1d( + model_in_dim, + config.upsample_initial_channel, + kernel_size=7, + stride=1, + padding=3, + ) + + self.upsampler = nn.ModuleList() + for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)): + self.upsampler.append( + nn.ConvTranspose1d( + config.upsample_initial_channel // (2**i), + config.upsample_initial_channel // (2 ** (i + 1)), + kernel_size=kernel_size, + stride=upsample_rate, + padding=(kernel_size - upsample_rate) // 2, + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.upsampler)): + channels = config.upsample_initial_channel // (2 ** (i + 1)) + for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes): + self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope)) + + self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3) + + def forward(self, input_embeds: torch.FloatTensor) -> torch.FloatTensor: + r""" + Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch + of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech + waveform. + + Args: + spectrogram (`torch.FloatTensor`): + Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length, + model_in_dim)`, or un-batched and of shape `(sequence_length, model_in_dim)`. Note that `model_in_dim` + is the sum of `config.unit_embed_dim`, `config.lang_embed_dim` and `config.spkr_embed_dim`. + + Returns: + `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of + shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`. + """ + + hidden_states = self.conv_pre(input_embeds) + for i in range(self.num_upsamples): + hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) + hidden_states = self.upsampler[i](hidden_states) + + res_state = self.resblocks[i * self.num_kernels](hidden_states) + for j in range(1, self.num_kernels): + res_state += self.resblocks[i * self.num_kernels + j](hidden_states) + hidden_states = res_state / self.num_kernels + + hidden_states = nn.functional.leaky_relu(hidden_states) + hidden_states = self.conv_post(hidden_states) + hidden_states = torch.tanh(hidden_states) + + # remove seq-len dim since this collapses to 1 + waveform = hidden_states.squeeze(1) + + return waveform + + +@auto_docstring( + custom_intro=""" + Code HiFi-GAN vocoder as described in this [repository](https://github.com/facebookresearch/speech-resynthesis). + """ +) +class SeamlessM4TCodeHifiGan(PreTrainedModel): + config: SeamlessM4TConfig + main_input_name = "input_embeds" + _no_split_modules = [] + + def __init__(self, config): + super().__init__(config) + + self.pad_token_id = config.t2u_pad_token_id + self.dur_predictor = SeamlessM4TVariancePredictor(config) + + self.unit_embedding = nn.Embedding(config.unit_hifi_gan_vocab_size, config.unit_embed_dim) + self.speaker_embedding = nn.Embedding(config.vocoder_num_spkrs, config.spkr_embed_dim) + self.language_embedding = nn.Embedding(config.vocoder_num_langs, config.lang_embed_dim) + + self.hifi_gan = SeamlessM4THifiGan(config) + + # Initialize weights and apply final processing + self.post_init() + + def _get_dur_output_lengths(self, input_ids, dur_out): + """ + Computes the output length after the duration layer. + """ + unit_lengths = (input_ids != self.pad_token_id).sum(1) + + # take care of edge cases where no padding or too many padding + unit_lengths = torch.clamp(unit_lengths, 0, dur_out.shape[1] - 1) + + cumulative_dur_out = torch.cumsum(dur_out, dim=1) + unit_lengths = cumulative_dur_out.gather(dim=1, index=unit_lengths.unsqueeze(1)).squeeze() + + return unit_lengths + + def _get_output_hifigan_lengths(self, input_lengths: Union[torch.LongTensor, int]): + """ + Computes the output length of the hifigan convolutional layers + """ + + def _conv_out_length(input_length, kernel_size, stride, pad, dilation=1): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return ( + torch.div(input_length + 2 * pad - dilation * (kernel_size - 1) - 1, stride, rounding_mode="floor") + 1 + ) + + def _transpose_conv_out_length(input_length, kernel_size, stride, pad, dilation=1): + return (input_length - 1) * stride - 2 * pad + dilation * (kernel_size - 1) + 1 + + # conv_pre + input_lengths = _conv_out_length(input_lengths, 7, 1, 3) + + # upsampler + for i, (upsample_rate, kernel_size) in enumerate( + zip(self.config.upsample_rates, self.config.upsample_kernel_sizes) + ): + input_lengths = _transpose_conv_out_length( + input_lengths, kernel_size, upsample_rate, (kernel_size - upsample_rate) // 2 + ) + + # resblock + for i in range(len(self.config.upsample_rates)): + for kernel_size, dilation in zip(self.config.resblock_kernel_sizes, self.config.resblock_dilation_sizes): + for dil in dilation: + input_lengths = _conv_out_length( + input_lengths, kernel_size, 1, (kernel_size - 1) * dil // 2, dilation=dil + ) + + for dil in dilation: + input_lengths = _conv_out_length(input_lengths, kernel_size, 1, (kernel_size - 1) // 2, dilation=1) + + # conv_post + input_lengths = _conv_out_length(input_lengths, 7, 1, 3) + + return input_lengths + + def forward( + self, input_ids: torch.LongTensor, spkr_id: torch.Tensor, lang_id: torch.Tensor + ) -> tuple[torch.Tensor]: + """ + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`SeamlessM4TTextToUnitForConditionalGeneration`]. [What are input + IDs?](../glossary#input-ids) + spkr_id (`int`, *optional*): + The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`. + tgt_lang (`str`, *optional*): + The language id to use as target language for translation. + """ + hidden_states = self.unit_embedding(input_ids).transpose(1, 2) + spkr = self.speaker_embedding(spkr_id).transpose(1, 2) + lang = self.language_embedding(lang_id).transpose(1, 2) + + log_dur_pred = self.dur_predictor(hidden_states.transpose(1, 2)) + dur_out = torch.clamp(torch.round(torch.expm1(log_dur_pred)).long(), min=1) + # B x C x T + if hidden_states.size(0) == 1: + hidden_states = torch.repeat_interleave(hidden_states, dur_out.view(-1), dim=2) + else: + # if batched sample, need to interleave per sample, and pad -> loss of parallelism + if hidden_states.shape[0] > 1 and self.training: + logger.warning( + """`self.training=True` and you use batching. You lose parallelism during the hifigan + forward pass because the samples are interleaved.""" + ) + hidden_states = [ + torch.repeat_interleave(hidden_state, duration, dim=-1).transpose(0, 1) + for (hidden_state, duration) in zip(hidden_states, dur_out) + ] + + hidden_states = nn.utils.rnn.pad_sequence(hidden_states, batch_first=True).transpose(1, 2) + + spkr = spkr.repeat(1, 1, hidden_states.shape[-1]) + lang = lang.repeat(1, 1, hidden_states.shape[-1]) + hidden_states = torch.cat([lang, hidden_states, spkr], dim=1) + + hidden_states = self.hifi_gan(hidden_states) + + unit_lengths = self._get_dur_output_lengths(input_ids, dur_out) + lengths = self._get_output_hifigan_lengths(unit_lengths) + + return hidden_states, lengths + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + + def apply_weight_norm(self): + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + weight_norm(self.hifi_gan.conv_pre) + for layer in self.hifi_gan.upsampler: + weight_norm(layer) + for layer in self.hifi_gan.resblocks: + layer.apply_weight_norm() + weight_norm(self.hifi_gan.conv_post) + + def remove_weight_norm(self): + nn.utils.remove_weight_norm(self.hifi_gan.conv_pre) + for layer in self.hifi_gan.upsampler: + nn.utils.remove_weight_norm(layer) + for layer in self.hifi_gan.resblocks: + layer.remove_weight_norm() + nn.utils.remove_weight_norm(self.hifi_gan.conv_post) + + +############ WHOLE MODEL related code ################ + + +@auto_docstring( + custom_intro=""" + The text-to-text SeamlessM4T Model transformer which can be used for T2TT. + """ +) +class SeamlessM4TForTextToText(SeamlessM4TPreTrainedModel, GenerationMixin): + _keys_to_ignore_on_load_missing = ["speech_encoder", "t2u_model", "vocoder"] + main_input_name = "input_ids" + + _tied_weights_keys = [ + "lm_head.weight", + "text_encoder.embed_tokens.weight", + "text_decoder.embed_tokens.weight", + ] + + def __init__(self, config: SeamlessM4TConfig): + super().__init__(config) + + self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + + self.text_encoder = SeamlessM4TEncoder(config, self.shared) + self.text_decoder = SeamlessM4TDecoder(config, self.shared) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.text_encoder + + def get_decoder(self): + return self.text_decoder + + def get_input_embeddings(self): + return self.text_decoder.embed_tokens + + def set_input_embeddings(self, value): + self.text_encoder.embed_tokens = value + self.text_decoder.embed_tokens = value + self.shared = value + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.lm_head, self.shared) + + @auto_docstring(custom_args=SEAMLESS_M4T_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + encoder_attention_mask = attention_mask + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + lm_logits = self.lm_head(decoder_outputs[0]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + labels = labels.to(lm_logits.device) + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + outputs = decoder_outputs + encoder_outputs + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def generate( + self, + input_ids=None, + tgt_lang=None, + generation_config=None, + logits_processor=None, + stopping_criteria=None, + prefix_allowed_tokens_fn=None, + synced_gpus=False, + **kwargs, + ): + """ + Generates sequences of token ids. + + + + Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the + model's default generation configuration. You can override any `generation_config` by passing the corresponding + parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. + + For an overview of generation strategies and code examples, check out the [following + guide](./generation_strategies). + + + + Parameters: + input_ids (`torch.Tensor` of varying shape depending on the modality, *optional*): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + tgt_lang (`str`, *optional*): + The language to use as target language for translation. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. `**kwargs` + passed to generate matching the attributes of `generation_config` will override them. If + `generation_config` is not provided, the default will be used, which had the following loading + priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model + configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s + default values, whose documentation should be checked to parameterize generation. + logits_processor (`LogitsProcessorList`, *optional*): + Custom logits processors that complement the default logits processors built from arguments and + generation config. If a logit processor is passed that is already created with the arguments or a + generation config an error is thrown. This feature is intended for advanced users. + stopping_criteria (`StoppingCriteriaList`, *optional*): + Custom stopping criteria that complement the default stopping criteria built from arguments and a + generation config. If a stopping criteria is passed that is already created with the arguments or a + generation config an error is thrown. This feature is intended for advanced users. + prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*): + If provided, this function constraints the beam search to allowed tokens only at each step. If not + provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and + `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned + on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful + for constrained generation conditioned on the prefix, as described in [Autoregressive Entity + Retrieval](https://huggingface.co/papers/2010.00904). + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). + kwargs (`dict[str, Any]`, *optional*): + Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be + forwarded to the `forward` function of the model. + + Return: + [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` + or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible + [`~utils.ModelOutput`] types are: + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] + """ + # prepare text_decoder_input_ids + text_decoder_input_ids = kwargs.pop("decoder_input_ids", None) + # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. + if tgt_lang is not None: + batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds")) + + if hasattr(self.generation_config, "text_decoder_lang_to_code_id"): + # also accept __xxx__ + tgt_lang = tgt_lang.replace("__", "") + if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id: + raise ValueError( + f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in + {", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}""" + ) + # tgt_lang gets priority over decoder input ids + text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang) + text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device) + else: + raise ValueError( + """This model generation config doesn't have a `text_decoder_lang_to_code_id` key which maps + the target language to the right token id. Make sure to load the right generation config.""" + ) + else: + # only a warning, otherwise errors appear in the tests + logger.warning( + """You must either specify a `tgt_lang` or pass a correct `text_decoder_input_ids` to get + a correct generation, otherwise the generation will probably make no sense.""" + ) + + return super().generate( + input_ids, + generation_config, + logits_processor, + stopping_criteria, + prefix_allowed_tokens_fn, + synced_gpus, + decoder_input_ids=text_decoder_input_ids, + **kwargs, + ) + + +@auto_docstring( + custom_intro=""" + The speech-to-text SeamlessM4T Model transformer which can be used for S2TT. + """ +) +class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel, GenerationMixin): + _keys_to_ignore_on_load_missing = ["text_encoder", "t2u_model", "vocoder"] + main_input_name = "input_features" + + _tied_weights_keys = [ + "lm_head.weight", + "text_decoder.embed_tokens.weight", + ] + + def __init__(self, config: SeamlessM4TConfig): + super().__init__(config) + + self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + self.speech_encoder = SeamlessM4TSpeechEncoder(config) + self.text_decoder = SeamlessM4TDecoder(config, self.shared) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.speech_encoder + + def get_decoder(self): + return self.text_decoder + + def get_input_embeddings(self): + return self.text_decoder.embed_tokens + + def set_input_embeddings(self, value): + self.text_decoder.embed_tokens = value + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.lm_head, self.shared) + + @auto_docstring(custom_args=SEAMLESS_M4T_COMMON_CUSTOM_ARGS) + def forward( + self, + input_features: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.speech_encoder( + input_features=input_features, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + encoder_attention_mask = attention_mask + if attention_mask is not None: + sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to( + encoder_outputs[0].device + ) + encoder_attention_mask = _compute_new_attention_mask( + hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths + ) + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + lm_logits = self.lm_head(decoder_outputs[0]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + labels = labels.to(lm_logits.device) + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + outputs = decoder_outputs + encoder_outputs + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def generate( + self, + input_features=None, + tgt_lang=None, + generation_config=None, + logits_processor=None, + stopping_criteria=None, + prefix_allowed_tokens_fn=None, + synced_gpus=False, + **kwargs, + ): + """ + Generates sequences of token ids. + + + + Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the + model's default generation configuration. You can override any `generation_config` by passing the corresponding + parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. + + For an overview of generation strategies and code examples, check out the [following + guide](./generation_strategies). + + + + Parameters: + input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`): + Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the + [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details. + + tgt_lang (`str`, *optional*): + The language to use as target language for translation. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. `**kwargs` + passed to generate matching the attributes of `generation_config` will override them. If + `generation_config` is not provided, the default will be used, which had the following loading + priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model + configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s + default values, whose documentation should be checked to parameterize generation. + logits_processor (`LogitsProcessorList`, *optional*): + Custom logits processors that complement the default logits processors built from arguments and + generation config. If a logit processor is passed that is already created with the arguments or a + generation config an error is thrown. This feature is intended for advanced users. + stopping_criteria (`StoppingCriteriaList`, *optional*): + Custom stopping criteria that complement the default stopping criteria built from arguments and a + generation config. If a stopping criteria is passed that is already created with the arguments or a + generation config an error is thrown. This feature is intended for advanced users. + prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*): + If provided, this function constraints the beam search to allowed tokens only at each step. If not + provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and + `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned + on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful + for constrained generation conditioned on the prefix, as described in [Autoregressive Entity + Retrieval](https://huggingface.co/papers/2010.00904). + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). + kwargs (`dict[str, Any]`, *optional*): + Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be + forwarded to the `forward` function of the model. + + Return: + [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` + or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible + [`~utils.ModelOutput`] types are: + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] + """ + text_decoder_input_ids = kwargs.pop("decoder_input_ids", None) + # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. + input_features = input_features if input_features is not None else kwargs.pop("inputs") + if tgt_lang is not None: + inputs = kwargs.get("input_embeds") if input_features is None else input_features + inputs = ( + inputs + if inputs is not None + else kwargs.get("encoder_outputs", {"last_hidden_state": None})["last_hidden_state"] + ) + batch_size = len(inputs) + + if hasattr(self.generation_config, "text_decoder_lang_to_code_id"): + # also accept __xxx__ + tgt_lang = tgt_lang.replace("__", "") + if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id: + raise ValueError( + f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in + {", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}""" + ) + # tgt_lang gets priority over decoder input ids + text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang) + text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device) + else: + raise ValueError( + """This model generation config doesn't have a `text_decoder_lang_to_code_id` key which maps + the target language to the right token id. Make sure to load the right generation config.""" + ) + else: + # only a warning, otherwise errors appear in the tests + logger.warning( + """You must either specify a `tgt_lang` or pass a correct `text_decoder_input_ids` to get + a correct generation, otherwise the generation will probably make no sense.""" + ) + return super().generate( + input_features, + generation_config, + logits_processor, + stopping_criteria, + prefix_allowed_tokens_fn, + synced_gpus, + decoder_input_ids=text_decoder_input_ids, + **kwargs, + ) + + +@auto_docstring( + custom_intro=""" + The text-to-speech SeamlessM4T Model transformer which can be used for T2ST. + """ +) +class SeamlessM4TForTextToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin): + _keys_to_ignore_on_load_missing = ["speech_encoder"] + main_input_name = "input_ids" + + _tied_weights_keys = [ + "lm_head.weight", + "text_encoder.embed_tokens.weight", + "text_decoder.embed_tokens.weight", + ] + + def __init__(self, config: SeamlessM4TConfig): + super().__init__(config) + + self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + + self.text_encoder = SeamlessM4TEncoder(config, self.shared) + self.text_decoder = SeamlessM4TDecoder(config, self.shared) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + self.t2u_model = SeamlessM4TTextToUnitForConditionalGeneration(config) + self.vocoder = SeamlessM4TCodeHifiGan(config) + + def get_encoder(self): + return self.text_encoder + + def get_decoder(self): + return self.text_decoder + + def get_input_embeddings(self): + return self.text_decoder.embed_tokens + + def set_input_embeddings(self, value): + self.text_encoder.embed_tokens = value + self.text_decoder.embed_tokens = value + self.shared = value + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.lm_head, self.shared) + + @auto_docstring(custom_args=SEAMLESS_M4T_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn + logger.warning( + "This is the same forward method as `SeamlessM4TForTextToText`." + "It doesn't use the text-to-unit model `SeamlessM4TTextToUnitForConditionalGeneration`." + "If you want to generate speech, use the `.generate` method." + ) + encoder_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + encoder_attention_mask = attention_mask + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + lm_logits = self.lm_head(decoder_outputs[0]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + labels = labels.to(lm_logits.device) + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + outputs = decoder_outputs + encoder_outputs + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + return_intermediate_token_ids: Optional[bool] = None, + tgt_lang: Optional[str] = None, + spkr_id: Optional[int] = 0, + **kwargs, + ) -> Union[torch.Tensor, SeamlessM4TGenerationOutput]: + """ + Generates translated audio waveforms. + + + + This method successively calls the `.generate` function of two different sub-models. You can specify keyword + arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments + that will be passed to one of them. + + For example, calling `.generate(input_ids, num_beams=4, speech_do_sample=True)` will successively perform + beam-search decoding on the text model, and multinomial beam-search sampling on the speech model. + + For an overview of generation strategies and code examples, check out the [following + guide](./generation_strategies). + + + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + return_intermediate_token_ids (`bool`, *optional*): + If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want + to get translated text alongside the audio. + tgt_lang (`str`, *optional*): + The language to use as target language for translation. + spkr_id (`int`, *optional*, defaults to 0): + The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`. + kwargs (*optional*): + Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword + arguments are of two types: + + - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model, + except for `decoder_input_ids` which will only be passed through the text components. + - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the + text model and speech model respectively. It has the priority over the keywords without a prefix. + + This means you can, for example, specify a generation strategy for one generation but not for the + other. + + + Returns: + `Union[SeamlessM4TGenerationOutput, tuple[Tensor]]`: + - If `return_intermediate_token_ids`, returns [`SeamlessM4TGenerationOutput`]. + - If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size, + sequence_length)` and `waveform_lengths` which gives the length of each sample. + """ + batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds")) + + if tgt_lang is None: + raise ValueError("You must specify a `tgt_lang` to generate translated speech.") + else: + # also accept __xxx__ + tgt_lang = tgt_lang.replace("__", "") + for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]: + lang_code_to_id = getattr(self.generation_config, key, None) + if lang_code_to_id is None: + raise ValueError( + f"""This model generation config doesn't have a `{key}` key which maps the target language + to the right token id. Make sure to load the right generation config.""" + ) + elif tgt_lang not in lang_code_to_id: + raise ValueError( + f"""`tgt_lang={tgt_lang}` is not supported by this model. + Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4T supports + more languages for text translation than for speech synthesis.""" + ) + + kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs) + kwargs_text["output_hidden_states"] = True + kwargs_text["return_dict_in_generate"] = True + kwargs_text["output_scores"] = True + + text_decoder_input_ids = kwargs_text.get("decoder_input_ids") + + # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. + text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang) + text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device) + + kwargs_text["decoder_input_ids"] = text_decoder_input_ids + + # first generation + text_generation_output = super().generate(input_ids, **kwargs_text) + sequences = text_generation_output.sequences + + # prepare second generation + num_return_sequences = len(sequences) // batch_size + attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None)) + + encoder_hidden_states = text_generation_output.encoder_hidden_states[-1] + + # take care of num_return_sequences + # take most probable hidden states per batch of return_sequences + # (batch_size*num_return_sequences, ...) -> (batch_size,...) + if num_return_sequences > 1: + idx_most_probable_sequences_per_batch = text_generation_output.sequences_scores.view(batch_size, -1) + idx_most_probable_sequences_per_batch = idx_most_probable_sequences_per_batch.argmax(-1) + idx_most_probable_sequences_per_batch = ( + idx_most_probable_sequences_per_batch + + torch.arange(batch_size, device=self.device) * num_return_sequences + ) + sequences = sequences[idx_most_probable_sequences_per_batch] + + # get decoder last hidden state - must do a pass through the text decoder + t2u_input_embeds = self.text_decoder( + input_ids=sequences, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=attention_mask, + ).last_hidden_state + + pad_token_id = self.generation_config.pad_token_id + + # Compute new attention mask + seq_lens = (sequences != pad_token_id).int().sum(1) + t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens) + kwargs_speech["attention_mask"] = t2u_model_attention_mask + + # Compute t2u decoder_input_ids + t2u_decoder_input_ids = kwargs_speech.get("decoder_input_ids") + t2u_tgt_lang_id = self.generation_config.t2u_lang_code_to_id.get(tgt_lang) + t2u_decoder_input_ids = torch.tensor( + [[self.config.t2u_eos_token_id, t2u_tgt_lang_id]] * batch_size, device=self.device + ) + kwargs_speech["decoder_input_ids"] = t2u_decoder_input_ids + # second generation + unit_ids = self.t2u_model.generate(inputs_embeds=t2u_input_embeds, **kwargs_speech) + output_unit_ids = unit_ids.detach().clone() + + # get rid of t2u_decoder_input_ids + unit_ids = unit_ids[:, kwargs_speech["decoder_input_ids"].shape[1] :] + # replace eos per pad + unit_ids[unit_ids == self.config.t2u_eos_token_id] = self.config.t2u_pad_token_id + # offset of control symbols + unit_ids = torch.where( + unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset + ) + + vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang) + vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device) + + spkr_id = torch.tensor([[spkr_id]] * len(unit_ids), device=self.device) + + waveform, waveform_lengths = self.vocoder(input_ids=unit_ids, spkr_id=spkr_id, lang_id=vocoder_tgt_lang_id) + + if return_intermediate_token_ids: + return SeamlessM4TGenerationOutput( + waveform=waveform, + waveform_lengths=waveform_lengths, + sequences=sequences, + unit_sequences=output_unit_ids, + ) + + return waveform, waveform_lengths + + +@auto_docstring( + custom_intro=""" + The speech-to-speech SeamlessM4T Model transformer which can be used for S2ST. + """ +) +class SeamlessM4TForSpeechToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin): + _keys_to_ignore_on_load_missing = ["text_encoder"] + main_input_name = "input_features" + + _tied_weights_keys = [ + "lm_head.weight", + "text_decoder.embed_tokens.weight", + ] + + def __init__(self, config): + super().__init__(config) + + self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + self.speech_encoder = SeamlessM4TSpeechEncoder(config) + self.text_decoder = SeamlessM4TDecoder(config, self.shared) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + self.t2u_model = SeamlessM4TTextToUnitForConditionalGeneration(config) + self.vocoder = SeamlessM4TCodeHifiGan(config) + + def get_encoder(self): + return self.speech_encoder + + def get_decoder(self): + return self.text_decoder + + def get_input_embeddings(self): + return self.text_decoder.embed_tokens + + def set_input_embeddings(self, value): + self.text_decoder.embed_tokens = value + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.lm_head, self.shared) + + @auto_docstring(custom_args=SEAMLESS_M4T_COMMON_CUSTOM_ARGS) + def forward( + self, + input_features: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn + logger.warning( + "This is the same forward method as `SeamlessM4TForSpeechToText`. It doesn't use `self.t2u_model`." + "If you want to generate speech, use the `generate` method." + ) + + encoder_outputs = self.speech_encoder( + input_features=input_features, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + encoder_attention_mask = attention_mask + if attention_mask is not None: + sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to( + encoder_outputs[0].device + ) + encoder_attention_mask = _compute_new_attention_mask( + hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths + ) + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + lm_logits = self.lm_head(decoder_outputs[0]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + labels = labels.to(lm_logits.device) + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + outputs = decoder_outputs + encoder_outputs + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + input_features: Optional[torch.Tensor] = None, + return_intermediate_token_ids: Optional[bool] = None, + tgt_lang: Optional[str] = None, + spkr_id: Optional[int] = 0, + **kwargs, + ) -> Union[torch.Tensor, SeamlessM4TGenerationOutput]: + """ + Generates translated audio waveforms. + + + + This method successively calls the `.generate` function of two different sub-models. You can specify keyword + arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments + that will be passed to one of them. + + For example, calling `.generate(input_features, num_beams=4, speech_do_sample=True)` will successively perform + beam-search decoding on the text model, and multinomial beam-search sampling on the speech model. + + For an overview of generation strategies and code examples, check out the [following + guide](./generation_strategies). + + + + Args: + input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`): + Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the + [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details. + return_intermediate_token_ids (`bool`, *optional*): + If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want + to get translated text alongside the audio. + tgt_lang (`str`, *optional*): + The language to use as target language for translation. + spkr_id (`int`, *optional*, defaults to 0): + The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`. + + kwargs (*optional*): + Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword + arguments are of two types: + + - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model, + except for `decoder_input_ids` which will only be passed through the text components. + - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the + text model and speech model respectively. It has the priority over the keywords without a prefix. + + This means you can, for example, specify a generation strategy for one generation but not for the + other. + + + Returns: + `Union[SeamlessM4TGenerationOutput, tuple[Tensor]]`: + - If `return_intermediate_token_ids`, returns [`SeamlessM4TGenerationOutput`]. + - If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size, + sequence_length)` and `waveform_lengths` which gives the length of each sample. + """ + batch_size = len(input_features) if input_features is not None else len(kwargs.get("inputs_embeds")) + + if tgt_lang is None: + raise ValueError("You must specify a `tgt_lang` to generate translated speech.") + else: + # also accept __xxx__ + tgt_lang = tgt_lang.replace("__", "") + for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]: + lang_code_to_id = getattr(self.generation_config, key, None) + if lang_code_to_id is None: + raise ValueError( + f"""This model generation config doesn't have a `{key}` key which maps the target language + to the right token id. Make sure to load the right generation config.""" + ) + elif tgt_lang not in lang_code_to_id: + raise ValueError( + f"""`tgt_lang={tgt_lang}` is not supported by this model. + Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4T supports + more languages for text translation than for speech synthesis.""" + ) + + kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs) + kwargs_text["output_hidden_states"] = True + kwargs_text["return_dict_in_generate"] = True + kwargs_text["output_scores"] = True + + text_decoder_input_ids = kwargs_text.get("decoder_input_ids") + # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. + text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang) + text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device) + + kwargs_text["decoder_input_ids"] = text_decoder_input_ids + + # first generation + text_generation_output = super().generate(input_features, **kwargs_text) + sequences = text_generation_output.sequences + + # prepare second generation + num_return_sequences = len(sequences) // batch_size + attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None)) + + # get last_hidden_state from encoder + encoder_hidden_states = self.speech_encoder(input_features=input_features, attention_mask=attention_mask)[0] + + # input modality = speech so new attention mask for the decoder + if attention_mask is not None: + sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to( + encoder_hidden_states.device + ) + attention_mask = _compute_new_attention_mask( + hidden_states=encoder_hidden_states, seq_lens=sub_sampled_lengths + ) + + # take care of num_return_sequences + # take most probable hidden states per batch of return_sequences + # (batch_size*num_return_sequences, ...) -> (batch_size,...) + if num_return_sequences > 1: + idx_most_probable_sequences_per_batch = text_generation_output.sequences_scores.view(batch_size, -1) + idx_most_probable_sequences_per_batch = idx_most_probable_sequences_per_batch.argmax(-1) + idx_most_probable_sequences_per_batch = ( + idx_most_probable_sequences_per_batch + + torch.arange(batch_size, device=self.device) * num_return_sequences + ) + sequences = sequences[idx_most_probable_sequences_per_batch] + + # get decoder last hidden state - must do a pass through the text decoder + t2u_input_embeds = self.text_decoder( + input_ids=sequences, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=attention_mask, + ).last_hidden_state + + pad_token_id = self.generation_config.pad_token_id + + # Compute new attention mask + seq_lens = (sequences != pad_token_id).int().sum(1) + t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens) + kwargs_speech["attention_mask"] = t2u_model_attention_mask + + # Compute t2u decoder_input_ids + t2u_decoder_input_ids = kwargs_speech.get("decoder_input_ids") + t2u_tgt_lang_id = self.generation_config.t2u_lang_code_to_id.get(tgt_lang) + t2u_decoder_input_ids = torch.tensor( + [[self.config.t2u_eos_token_id, t2u_tgt_lang_id]] * batch_size, device=self.device + ) + kwargs_speech["decoder_input_ids"] = t2u_decoder_input_ids + + # second generation + unit_ids = self.t2u_model.generate(inputs_embeds=t2u_input_embeds, **kwargs_speech) + output_unit_ids = unit_ids.detach().clone() + + # get rid of t2u_decoder_input_ids + unit_ids = unit_ids[:, kwargs_speech["decoder_input_ids"].shape[1] :] + # replace eos per pad + unit_ids[unit_ids == self.config.t2u_eos_token_id] = self.config.t2u_pad_token_id + # offset of control symbols + unit_ids = torch.where( + unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset + ) + + vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang) + vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device) + + spkr_id = torch.tensor([[spkr_id]] * len(unit_ids), device=self.device) + + waveform, waveform_lengths = self.vocoder(input_ids=unit_ids, spkr_id=spkr_id, lang_id=vocoder_tgt_lang_id) + + if return_intermediate_token_ids: + return SeamlessM4TGenerationOutput( + waveform=waveform, + waveform_lengths=waveform_lengths, + sequences=sequences, + unit_sequences=output_unit_ids, + ) + + return waveform, waveform_lengths + + +@auto_docstring( + custom_intro=""" + The original SeamlessM4T Model transformer which can be used for every tasks available (S2ST, S2TT, T2TT, T2ST). + """ +) +class SeamlessM4TModel(SeamlessM4TPreTrainedModel, GenerationMixin): + _tied_weights_keys = [ + "lm_head.weight", + "text_encoder.embed_tokens.weight", + "text_decoder.embed_tokens.weight", + ] + + def __init__(self, config, current_modality="text"): + r""" + current_modality (`str`, *optional*, defaults to `"text"`): + Default modality. Used to initialize the model. + """ + super().__init__(config) + + self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + + self.text_encoder = SeamlessM4TEncoder(config, self.shared) + self.speech_encoder = SeamlessM4TSpeechEncoder(config) + self.text_decoder = SeamlessM4TDecoder(config, self.shared) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + self.current_modality = current_modality + if current_modality == "speech": + self.main_input_name = "input_features" + + # these models already call post_init in their initialization + self.t2u_model = SeamlessM4TTextToUnitForConditionalGeneration(config) + self.vocoder = SeamlessM4TCodeHifiGan(config) + + def set_modality(self, modality="text"): + if modality == "text": + self.main_input_name = "input_ids" + self.current_modality = "text" + elif modality == "speech": + self.main_input_name = "input_features" + self.current_modality = "speech" + else: + raise ValueError(f"`modality={modality}` is not a valid modality. It must be `text` or `speech`.") + + def get_encoder(self): + if self.current_modality == "text": + return self.text_encoder + else: + return self.speech_encoder + + def get_input_embeddings(self): + return self.text_decoder.embed_tokens + + def set_input_embeddings(self, value): + self.text_encoder.embed_tokens = value + self.text_decoder.embed_tokens = value + self.shared = value + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.lm_head, self.shared) + + @auto_docstring(custom_args=SEAMLESS_M4T_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + if input_ids is None and input_features is None and inputs_embeds is None and encoder_outputs is None: + raise ValueError( + "`input_ids`,`input_features`, `inputs_embeds` and `encoder_outputs` are all empty. Make sure at least one of them is not." + ) + elif input_features is not None: + if input_ids is not None: + logger.warning( + "`input_ids` is not `None` but `input_features` has been given." + "`input_features` will be used in priority through the `speech_encoder`. " + "Make sure that `input_features` and `input_ids` are mutually exclusive." + ) + + if inputs_embeds is not None: + logger.warning( + "`inputs_embeds` is not `None` but `input_features` has been given." + "`input_features` will be used in priority through `speech_encoder`. " + "`inputs_embeds` will be ignored." + ) + + # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn + logger.warning( + "This calls the same method `forward` as `SeamlessM4TForTextToText` and `SeamlessM4TForSpeechToText`" + "depending on the input modality. If you want to generate speech, use the `generate` method." + ) + + self.set_modality("speech") + + encoder_outputs = self.speech_encoder( + input_features=input_features, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + elif input_ids is not None or inputs_embeds is not None: + # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn + logger.warning( + "This calls the same method `forward` as `SeamlessM4TForTextToText` and `SeamlessM4TForSpeechToText`" + "depending on the input modality. If you want to generate speech, use the `generate` method." + ) + self.set_modality("text") + encoder_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + encoder_attention_mask = attention_mask + # input modality = speech so new attention mask + if self.current_modality == "speech" and attention_mask is not None: + sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to( + encoder_outputs[0].device + ) + encoder_attention_mask = _compute_new_attention_mask( + hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths + ) + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + lm_logits = self.lm_head(decoder_outputs[0]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + labels = labels.to(lm_logits.device) + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + outputs = decoder_outputs + encoder_outputs + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + input_features: Optional[torch.Tensor] = None, + return_intermediate_token_ids: Optional[bool] = None, + tgt_lang: Optional[str] = None, + spkr_id: Optional[int] = 0, + generate_speech: Optional[bool] = True, + **kwargs, + ) -> Union[torch.Tensor, SeamlessM4TGenerationOutput]: + """ + Generates translated token ids and/or translated audio waveforms. + + + + This method successively calls the `.generate` function of two different sub-models. You can specify keyword + arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments + that will be passed to one of them. + + For example, calling `.generate(input_ids=input_ids, num_beams=4, speech_do_sample=True)` will successively + perform beam-search decoding on the text model, and multinomial beam-search sampling on the speech model. + + For an overview of generation strategies and code examples, check out the [following + guide](./generation_strategies). + + + + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*): + Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the + [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details. + return_intermediate_token_ids (`bool`, *optional*): + If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want + to get translated text alongside the audio. + Note that if `generate_speech=False`, this parameter will be ignored and + the text tokens are returned. + tgt_lang (`str`, *optional*): + The language to use as target language for translation. + spkr_id (`int`, *optional*, defaults to 0): + The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`. + generate_speech (`bool`, *optional*, defaults to `True`): + If `False`, will only returns the text tokens and won't generate speech. + + kwargs (*optional*): + Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword + arguments are of two types: + + - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model, + except for `decoder_input_ids` which will only be passed through the text components. + - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the + text model and speech model respectively. It has the priority over the keywords without a prefix. + + This means you can, for example, specify a generation strategy for one generation but not for the + other. + + Returns: + `Union[SeamlessM4TGenerationOutput, tuple[Tensor], ModelOutput]`: + - If `generate_speech` and `return_intermediate_token_ids`, returns [`SeamlessM4TGenerationOutput`]. + - If `generate_speech` and not `return_intermediate_token_ids`, returns a tuple composed of waveforms of + shape `(batch_size, sequence_length)` and `waveform_lengths` which gives the length of each sample. + - If `generate_speech=False`, it will returns `ModelOutput`. + """ + if input_ids is None and input_features is None and kwargs.get("inputs_embeds") is None: + raise ValueError( + "`input_ids`,`input_features` and `inputs_embeds` are all empty. Make sure at least one of them is not." + ) + + if generate_speech and tgt_lang is None: + raise ValueError("You must specify a `tgt_lang` to generate translated speech.") + + if tgt_lang is not None: + # also accept __xxx__ + tgt_lang = tgt_lang.replace("__", "") + for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]: + lang_code_to_id = getattr(self.generation_config, key, None) + if lang_code_to_id is None: + raise ValueError( + f"""This model generation config doesn't have a `{key}` key which maps the target language + to the right token id. Make sure to load the right generation config.""" + ) + elif tgt_lang not in lang_code_to_id: + raise ValueError( + f"""`tgt_lang={tgt_lang}` is not supported by this model. + Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4T supports + more languages for text translation than for speech synthesis.""" + ) + + batch_size = ( + len(input_features) + if input_features is not None + else (len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds"))) + ) + + kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs) + kwargs_text["output_hidden_states"] = True + kwargs_text["return_dict_in_generate"] = True + kwargs_text["output_scores"] = True + + text_decoder_input_ids = kwargs_text.get("decoder_input_ids") + # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. + if tgt_lang is not None: + # tgt_lang gets priority over decoder input ids + text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang) + text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device) + + kwargs_text["decoder_input_ids"] = text_decoder_input_ids + + # first generation + if input_features is not None: + self.set_modality("speech") + if input_ids is not None: + logger.warning( + "`input_features` and `input_ids` are both non empty. `input_features` will be used in priority " + "through the speech encoder. Make sure `input_features=None` if you want to use the text encoder." + ) + text_generation_output = super().generate(input_features=input_features, **kwargs_text) + else: + self.set_modality("text") + text_generation_output = super().generate(input_ids=input_ids, input_features=None, **kwargs_text) + sequences = text_generation_output.sequences + + if not generate_speech: + return text_generation_output + + # prepare second generation + num_return_sequences = len(sequences) // batch_size + attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None)) + + # get encoder last hidden states + if self.current_modality == "speech": + # get last_hidden_state from encoder - must do a pass through the speech encoder + encoder_hidden_states = self.speech_encoder( + input_features=input_features, attention_mask=attention_mask + ).last_hidden_state + + # input modality = speech so new attention mask for the decoder + if attention_mask is not None: + sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to( + encoder_hidden_states.device + ) + attention_mask = _compute_new_attention_mask( + hidden_states=encoder_hidden_states, seq_lens=sub_sampled_lengths + ) + else: + encoder_hidden_states = text_generation_output.encoder_hidden_states[-1] + + # take care of num_return_sequences + # take most probable hidden states per batch of return_sequences + # (batch_size*num_return_sequences, ...) -> (batch_size,...) + if num_return_sequences > 1: + idx_most_probable_sequences_per_batch = text_generation_output.sequences_scores.view(batch_size, -1) + idx_most_probable_sequences_per_batch = idx_most_probable_sequences_per_batch.argmax(-1) + idx_most_probable_sequences_per_batch = ( + idx_most_probable_sequences_per_batch + + torch.arange(batch_size, device=self.device) * num_return_sequences + ) + sequences = sequences[idx_most_probable_sequences_per_batch] + + # get decoder last hidden state - must do a pass through the text decoder + t2u_input_embeds = self.text_decoder( + input_ids=sequences, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=attention_mask, + ).last_hidden_state + + pad_token_id = self.generation_config.pad_token_id + + # Compute new attention mask + seq_lens = (sequences != pad_token_id).int().sum(1) + t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens) + kwargs_speech["attention_mask"] = t2u_model_attention_mask + + # Compute t2u decoder_input_ids + t2u_decoder_input_ids = kwargs_speech.get("decoder_input_ids") + t2u_tgt_lang_id = self.generation_config.t2u_lang_code_to_id.get(tgt_lang) + t2u_decoder_input_ids = torch.tensor( + [[self.config.t2u_eos_token_id, t2u_tgt_lang_id]] * batch_size, device=self.device + ) + kwargs_speech["decoder_input_ids"] = t2u_decoder_input_ids + + # second generation + unit_ids = self.t2u_model.generate(inputs_embeds=t2u_input_embeds, **kwargs_speech) + output_unit_ids = unit_ids.detach().clone() + + # get rid of t2u_decoder_input_ids + unit_ids = unit_ids[:, kwargs_speech["decoder_input_ids"].shape[1] :] + # replace eos per pad + unit_ids[unit_ids == self.config.t2u_eos_token_id] = self.config.t2u_pad_token_id + # offset of control symbols + unit_ids = torch.where( + unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset + ) + + vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang) + vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device) + + spkr_id = torch.tensor([[spkr_id]] * len(unit_ids), device=self.device) + + waveform, waveform_lengths = self.vocoder(input_ids=unit_ids, spkr_id=spkr_id, lang_id=vocoder_tgt_lang_id) + + if return_intermediate_token_ids: + return SeamlessM4TGenerationOutput( + waveform=waveform, + waveform_lengths=waveform_lengths, + sequences=sequences, + unit_sequences=output_unit_ids, + ) + + return waveform, waveform_lengths + + +__all__ = [ + "SeamlessM4TForTextToSpeech", + "SeamlessM4TForSpeechToSpeech", + "SeamlessM4TForTextToText", + "SeamlessM4TForSpeechToText", + "SeamlessM4TModel", + "SeamlessM4TPreTrainedModel", + "SeamlessM4TCodeHifiGan", + "SeamlessM4THifiGan", + "SeamlessM4TTextToUnitForConditionalGeneration", + "SeamlessM4TTextToUnitModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/processing_seamless_m4t.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/processing_seamless_m4t.py new file mode 100644 index 0000000000000000000000000000000000000000..85e1968d7d43c006b5495b5729031abfd5026010 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/processing_seamless_m4t.py @@ -0,0 +1,109 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Audio/Text processor class for SeamlessM4T +""" + +from typing import Optional, Union + +from ...audio_utils import AudioInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging +from ...utils.deprecation import deprecate_kwarg + + +logger = logging.get_logger(__name__) + + +class SeamlessM4TTextKwargs(TextKwargs): + src_lang: Optional[str] + tgt_lang: Optional[str] + + +class SeamlessM4TProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: SeamlessM4TTextKwargs + _defaults = {} + + +class SeamlessM4TProcessor(ProcessorMixin): + r""" + Constructs a SeamlessM4T processor which wraps a SeamlessM4T feature extractor and a SeamlessM4T tokenizer into a + single processor. + + [`SeamlessM4TProcessor`] offers all the functionalities of [`SeamlessM4TFeatureExtractor`] and + [`SeamlessM4TTokenizerFast`]. See the [`~SeamlessM4TProcessor.__call__`] and [`~SeamlessM4TProcessor.decode`] for + more information. + + Args: + feature_extractor ([`SeamlessM4TFeatureExtractor`]): + The audio processor is a required input. + tokenizer ([`SeamlessM4TTokenizerFast`]): + The tokenizer is a required input. + """ + + feature_extractor_class = "SeamlessM4TFeatureExtractor" + tokenizer_class = ("SeamlessM4TTokenizer", "SeamlessM4TTokenizerFast") + valid_processor_kwargs = SeamlessM4TProcessorKwargs + + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) + + @deprecate_kwarg("audios", version="v4.59.0", new_name="audio") + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, + audios: Optional[AudioInput] = None, + audio: Optional[AudioInput] = None, + **kwargs: Unpack[ProcessingKwargs], + ): + """ + Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text` + and `kwargs` arguments to SeamlessM4TTokenizerFast's [`~SeamlessM4TTokenizerFast.__call__`] if `text` is not + `None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to + SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audios` is not `None`. Please refer + to the docstring of the above two methods for more information. + + Args: + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + audios (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`): + The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case + of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels, + and T the sample length of the audio. + Returns: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **input_features** -- Audio input features to be fed to a model. Returned when `audios` is not `None`. + """ + if text is not None and audios is not None: + raise ValueError( + "Text and audios are mututally exclusive when passed to `SeamlessM4T`. Specify one or another." + ) + if audio is None and audios is not None: + logger.warning( + "Passing `audios` as keyword argument is deprecated and will be removed in v4.63, please pass `audio` instead." + ) + audio = audios + return super().__call__(text=text, audio=audio, **kwargs) + + +__all__ = ["SeamlessM4TProcessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/tokenization_seamless_m4t.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/tokenization_seamless_m4t.py new file mode 100644 index 0000000000000000000000000000000000000000..fd773316580c1dc3961a954dc2fd3a8e0d4face9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/tokenization_seamless_m4t.py @@ -0,0 +1,567 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for SeamlessM4T.""" + +import os +from shutil import copyfile +from typing import Any, Optional, Union + +import sentencepiece as spm + +from ...convert_slow_tokenizer import import_protobuf +from ...tokenization_utils import ( + BatchEncoding, + PreTokenizedInput, + PreTrainedTokenizer, + TextInput, +) +from ...tokenization_utils_base import AddedToken +from ...utils import PaddingStrategy, logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + + +SPIECE_UNDERLINE = "▁" + + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} + + +@requires(backends=("sentencepiece",)) +class SeamlessM4TTokenizer(PreTrainedTokenizer): + """ + Construct a SeamlessM4T tokenizer. + + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + The tokenization method is ` ` for source language documents, and ` ` for target language documents. + + Examples: + + ```python + >>> from transformers import SeamlessM4TTokenizer + + >>> tokenizer = SeamlessM4TTokenizer.from_pretrained( + ... "facebook/hf-seamless-m4t-medium", src_lang="eng", tgt_lang="fra" + ... ) + >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" + >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie." + >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt") + ``` + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + tokenizer_file (`str`, *optional*): + The path to a tokenizer file to use instead of the vocab file. + src_lang (`str`, *optional*, defaults to `"eng"`): + The language to use as source language for translation. + tgt_lang (`str`, *optional*, defaults to `"fra"`): + The language to use as target language for translation. + sp_model_kwargs (`dict[str, Any]`, *optional*): + Additional keyword arguments to pass to the model initialization. + additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*): + A tuple or a list of additional special tokens. Can be used to specify the list of languages that will be + supported by the tokenizer. + add_prefix_space (`bool`, *optional*, defaults to `True`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + prefix_tokens: list[int] = [] + suffix_tokens: list[int] = [] + + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + tokenizer_file=None, + src_lang="eng", + tgt_lang="fra", + sp_model_kwargs: Optional[dict[str, Any]] = None, + additional_special_tokens=None, + add_prefix_space=True, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + # Add this unused argument to keep some important Copied from statements + self.legacy = False + self.vocab_file = vocab_file + + self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False)) + + # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 + # -------- | ------- | ------- | ------ | ------- | ---- | ---- | ---- | ---- | ---- | ---- + # spm | '' | '' | '' | 'an' | 'en' | '_d' | 'er' | 'in' | '_s' | '_a' + # fairseq | '' | '' | '' | '' | 'an' | 'en' | '▁d' | 'er' | 'in' | '▁s' + + # Mimic fairseq token-to-id alignment for the first 4 token + self._added_tokens_decoder = { + 0: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token, + 1: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token, + 2: AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token, + 3: AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token, + } + + # The first "real" token "an" has position 4 in the original fairseq vocab and position 3 in the spm vocab + self.fairseq_offset = 1 + + self.sp_model_size = len(self.sp_model) + + self._src_lang = f"__{src_lang}__" if "__" not in src_lang else src_lang + self._tgt_lang = f"__{tgt_lang}__" if "__" not in tgt_lang else tgt_lang + self.add_prefix_space = add_prefix_space + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + tokenizer_file=tokenizer_file, + src_lang=src_lang, + tgt_lang=tgt_lang, + additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + self.set_src_lang_special_tokens(self._src_lang) + self.set_tgt_lang_special_tokens(self._tgt_lang) + + # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.__getstate__ + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.__setstate__ + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + @property + def vocab_size(self): + return len(self.sp_model) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, + text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + text_pair_target: Optional[ + Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] + ] = None, + padding: Union[bool, str, PaddingStrategy] = True, + pad_to_multiple_of: Optional[int] = 2, + src_lang: Optional[str] = None, + tgt_lang: Optional[str] = None, + **kwargs, + ): + """ + Args: + text (`str`, `list[str]`, `list[list[str]]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + text_pair (`str`, `list[str]`, `list[list[str]]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + text_target (`str`, `list[str]`, `list[list[str]]`, *optional*): + The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a + list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), + you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + text_pair_target (`str`, `list[str]`, `list[list[str]]`, *optional*): + The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a + list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), + you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + src_lang (`str`, *optional*): + A string representing the source language. If not specified, the last `src_lang` specified (either + during initialization or when calling this tokenizer) will be used. + tgt_lang (`str`, *optional*): + A string representing the target language. If not specified, the last `tgt_lang` specified (either + during initialization or when calling this tokenizer) will be used. + kwargs (*optional*): + Remaining dictionary of keyword arguments that will be passed to [`PreTrainedTokenizer.__call__`]. + """ + if src_lang is not None: + self.src_lang = src_lang + if tgt_lang is not None: + self.tgt_lang = tgt_lang + + output = super().__call__( + text=text, + text_pair=text_pair, + text_target=text_target, + text_pair_target=text_pair_target, + padding=padding, + pad_to_multiple_of=pad_to_multiple_of, + **kwargs, + ) + + return BatchEncoding(output, tensor_type=kwargs.get("return_tensors")) + + @property + # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.src_lang + def src_lang(self) -> str: + return self._src_lang + + @src_lang.setter + def src_lang(self, new_src_lang: str) -> None: + if "__" not in new_src_lang: + self._src_lang = f"__{new_src_lang}__" + else: + self._src_lang = new_src_lang + self.set_src_lang_special_tokens(self._src_lang) + + @property + def tgt_lang(self) -> str: + return self._tgt_lang + + @tgt_lang.setter + def tgt_lang(self, new_tgt_lang: str) -> None: + if "__" not in new_tgt_lang: + self._tgt_lang = f"__{new_tgt_lang}__" + else: + self._tgt_lang = new_tgt_lang + self.set_tgt_lang_special_tokens(self._tgt_lang) + + # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.get_special_tokens_mask + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + prefix_ones = [1] * len(self.prefix_tokens) + suffix_ones = [1] * len(self.suffix_tokens) + if token_ids_1 is None: + return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones + return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones + + # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An NLLB sequence has the following format, where `X` represents the sequence: + + - `input_ids` (for encoder) `X [eos, src_lang_code]` + - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.create_token_type_ids_from_sequences + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def _build_translation_inputs( + self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs + ): + """Used by translation pipeline, to prepare inputs for the generate function""" + if src_lang is None or tgt_lang is None: + raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model.") + self.src_lang = src_lang + inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) + if "__" not in tgt_lang: + tgt_lang = f"__{tgt_lang}__" + tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) + inputs["forced_bos_token_id"] = tgt_lang_id + return inputs + + def get_vocab(self): + vocab = { + self.convert_ids_to_tokens(i): i for i in range(self.fairseq_offset, self.vocab_size + self.fairseq_offset) + } + vocab.update(self.added_tokens_encoder) + return vocab + + @property + def unk_token_length(self): + return len(self.sp_model.encode(str(self.unk_token))) + + # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor + def get_spm_processor(self, from_slow=False): + tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs) + if self.legacy or from_slow: # no dependency on protobuf + tokenizer.Load(self.vocab_file) + return tokenizer + + with open(self.vocab_file, "rb") as f: + sp_model = f.read() + model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)") + model = model_pb2.ModelProto.FromString(sp_model) + normalizer_spec = model_pb2.NormalizerSpec() + normalizer_spec.add_dummy_prefix = False + model.normalizer_spec.MergeFrom(normalizer_spec) + sp_model = model.SerializeToString() + tokenizer.LoadFromSerializedProto(sp_model) + return tokenizer + + # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize + def tokenize(self, text: "TextInput", **kwargs) -> list[str]: + """ + Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the + first token is special. + """ + if self.legacy or len(text) == 0: + return super().tokenize(text, **kwargs) + + text = text.replace(SPIECE_UNDERLINE, " ") + if self.add_prefix_space: + text = SPIECE_UNDERLINE + text + + tokens = super().tokenize(text, **kwargs) + + if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens: + tokens = tokens[1:] + return tokens + + # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize + def _tokenize(self, text, **kwargs): + """ + Returns a tokenized string. + + We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any + SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give + `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the + `unk_token`. Here is an example with `unk_token = ""` and `unk_token_length = 4`. + `self.tokenizer.sp_model.encode(" Hey", out_type = str)[4:]`. + """ + if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")): + return self.sp_model.encode(text, out_type=str) + + # 1. Encode string + prefix ex: " Hey" + tokens = self.sp_model.encode(self.unk_token + text, out_type=str) + # 2. Remove self.unk_token from ['<','unk','>', '▁Hey'] + return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + spm_id = self.sp_model.PieceToId(token) + + # Need to return unknown token if the SP model returned 0 + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + # since we manually add the prefix space, we have to remove it when decoding + if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space: + tokens[0] = tokens[0][1:] + + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.prepare_seq2seq_batch with eng_Latn->eng, fra_Latn->fra + def prepare_seq2seq_batch( + self, + src_texts: list[str], + src_lang: str = "eng", + tgt_texts: Optional[list[str]] = None, + tgt_lang: str = "fra", + **kwargs, + ) -> BatchEncoding: + self.src_lang = src_lang + self.tgt_lang = tgt_lang + return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) + + # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer._switch_to_input_mode + def _switch_to_input_mode(self): + return self.set_src_lang_special_tokens(self.src_lang) + + # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer._switch_to_target_mode + def _switch_to_target_mode(self): + return self.set_tgt_lang_special_tokens(self.tgt_lang) + + def set_src_lang_special_tokens(self, src_lang) -> None: + """Reset the special tokens to the source lang setting. + Prefix=[src_lang_code], suffix = [eos] + """ + self.cur_lang_code = self.convert_tokens_to_ids(src_lang) + self.init_kwargs["src_lang"] = src_lang + + if self.cur_lang_code == self.unk_token_id: + logger.warning_once( + f"`src_lang={src_lang}` has not be found in the vocabulary. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id." + ) + + self.prefix_tokens = [self.cur_lang_code] + self.suffix_tokens = [self.eos_token_id] + + # https://github.com/facebookresearch/fairseq2/blob/c53f18e6be6b8b46b722f2249b8397b7eccd7ad3/src/fairseq2/models/nllb/tokenizer.py#L112-L116 + def set_tgt_lang_special_tokens(self, lang: str) -> None: + """Reset the special tokens to the target lang setting. + Prefix=[eos, tgt_lang_code] and suffix=[eos]. + """ + self.cur_lang_code = self.convert_tokens_to_ids(lang) + self.init_kwargs["tgt_lang"] = lang + + if self.cur_lang_code == self.unk_token_id: + logger.warning_once( + f"`tgt_lang={lang}` has not be found in the vocabulary. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id." + ) + + self.prefix_tokens = [self.eos_token_id, self.cur_lang_code] + self.suffix_tokens = [self.eos_token_id] + + +__all__ = ["SeamlessM4TTokenizer"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..0318336332c3c9e4ff9dfad1a050cc7b0a3fe47d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py @@ -0,0 +1,446 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Tokenization class for SeamlessM4T.""" + +import os +from shutil import copyfile +from typing import Optional, Union + +from tokenizers import processors + +from ...tokenization_utils import ( + BatchEncoding, + PreTokenizedInput, + TextInput, +) +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import PaddingStrategy, is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_seamless_m4t import SeamlessM4TTokenizer +else: + SeamlessM4TTokenizer = None + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} + + +class SeamlessM4TTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" SeamlessM4T tokenizer (backed by HuggingFace's *tokenizers* library). Based on + [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models). + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + The tokenization method is ` ` for source language documents, and ` ` for target language documents. + + Examples: + + ```python + >>> from transformers import SeamlessM4TTokenizerFast + + >>> tokenizer = SeamlessM4TTokenizerFast.from_pretrained( + ... "facebook/hf-seamless-m4t-medium", src_lang="eng", tgt_lang="fra" + ... ) + >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" + >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie." + >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt") + ``` + + Args: + vocab_file (`str`, *optional*): + Path to the vocabulary file. + tokenizer_file (`str`, *optional*): + The path to a tokenizer file to use instead of the vocab file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + src_lang (`str`, *optional*, defaults to `"eng"`): + The language to use as source language for translation. + tgt_lang (`str`, *optional*, defaults to `"fra"`): + The language to use as target language for translation. + additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*): + A tuple or a list of additional special tokens. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = SeamlessM4TTokenizer + model_input_names = ["input_ids", "attention_mask"] + + prefix_tokens: list[int] = [] + suffix_tokens: list[int] = [] + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + src_lang="eng", + tgt_lang="fra", + additional_special_tokens=None, + **kwargs, + ): + super().__init__( + vocab_file=vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + src_lang=src_lang, + tgt_lang=tgt_lang, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + self.vocab_file = vocab_file + self._src_lang = f"__{src_lang}__" if "__" not in src_lang else src_lang + self._tgt_lang = f"__{tgt_lang}__" if "__" not in tgt_lang else tgt_lang + self.set_src_lang_special_tokens(self._src_lang) + self.set_tgt_lang_special_tokens(self._tgt_lang) + + @property + # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.src_lang + def src_lang(self) -> str: + return self._src_lang + + @src_lang.setter + def src_lang(self, new_src_lang: str) -> None: + if "__" not in new_src_lang: + self._src_lang = f"__{new_src_lang}__" + else: + self._src_lang = new_src_lang + self.set_src_lang_special_tokens(self._src_lang) + + @property + def tgt_lang(self) -> str: + return self._tgt_lang + + @tgt_lang.setter + def tgt_lang(self, new_tgt_lang: str) -> None: + if "__" not in new_tgt_lang: + self._tgt_lang = f"__{new_tgt_lang}__" + else: + self._tgt_lang = new_tgt_lang + self.set_tgt_lang_special_tokens(self._tgt_lang) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. The special tokens depend on calling set_lang. + + An SeamlessM4T sequence has the following format, where `X` represents the sequence: + + - `input_ids` (for encoder) `[src_lang_code] X [eos]` + - `decoder_input_ids`: (for decoder) `[eos, tgt_lang_code] X [eos]` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.create_token_type_ids_from_sequences + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def _build_translation_inputs( + self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs + ): + """Used by translation pipeline, to prepare inputs for the generate function""" + if src_lang is None or tgt_lang is None: + raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model") + self.src_lang = src_lang + inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) + if "__" not in tgt_lang: + tgt_lang = f"__{tgt_lang}__" + tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) + inputs["forced_bos_token_id"] = tgt_lang_id + return inputs + + # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.prepare_seq2seq_batch with "fra_Latn"->"fra", "eng_Latn"->"eng" + def prepare_seq2seq_batch( + self, + src_texts: list[str], + src_lang: str = "eng", + tgt_texts: Optional[list[str]] = None, + tgt_lang: str = "fra", + **kwargs, + ) -> BatchEncoding: + self.src_lang = src_lang + self.tgt_lang = tgt_lang + return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs) + + # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast._switch_to_input_mode + def _switch_to_input_mode(self): + return self.set_src_lang_special_tokens(self.src_lang) + + # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast._switch_to_target_mode + def _switch_to_target_mode(self): + return self.set_tgt_lang_special_tokens(self.tgt_lang) + + def set_src_lang_special_tokens(self, src_lang) -> None: + """Reset the special tokens to the source lang setting. + Prefix=[src_lang_code], suffix = [eos] + """ + self.cur_lang_code = self.convert_tokens_to_ids(src_lang) + + if self.cur_lang_code == self.unk_token_id: + logger.warning_once( + f"`tgt_lang={src_lang}` has not be found in the `vocabulary`. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id." + ) + + self.init_kwargs["src_lang"] = src_lang + + self.prefix_tokens = [self.cur_lang_code] + self.suffix_tokens = [self.eos_token_id] + + prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) + suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) + + self._tokenizer.post_processor = processors.TemplateProcessing( + single=prefix_tokens_str + ["$A"] + suffix_tokens_str, + pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, + special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), + ) + + def set_tgt_lang_special_tokens(self, lang: str) -> None: + """Reset the special tokens to the target lang setting. + Prefix=[eos, tgt_lang_code] and suffix=[eos]. + """ + self.cur_lang_code = self.convert_tokens_to_ids(lang) + + if self.cur_lang_code == self.unk_token_id: + logger.warning_once( + f"`tgt_lang={lang}` has not be found in the `vocabulary`. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id." + ) + + self.init_kwargs["tgt_lang"] = lang + + self.prefix_tokens = [self.eos_token_id, self.cur_lang_code] + self.suffix_tokens = [self.eos_token_id] + + prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) + suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) + + self._tokenizer.post_processor = processors.TemplateProcessing( + single=prefix_tokens_str + ["$A"] + suffix_tokens_str, + pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, + special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), + ) + + # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory.") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + @classmethod + def _from_pretrained( + cls, + resolved_vocab_files, + pretrained_model_name_or_path, + init_configuration, + *init_inputs, + token=None, + cache_dir=None, + local_files_only=False, + _commit_hash=None, + _is_local=False, + **kwargs, + ): + tokenizer = super()._from_pretrained( + resolved_vocab_files, + pretrained_model_name_or_path, + init_configuration, + *init_inputs, + token=token, + cache_dir=cache_dir, + local_files_only=local_files_only, + _commit_hash=_commit_hash, + _is_local=_is_local, + **kwargs, + ) + + # ensure also set after from pretrained + tokenizer.set_src_lang_special_tokens(tokenizer._src_lang) + tokenizer.set_tgt_lang_special_tokens(tokenizer._tgt_lang) + + return tokenizer + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, + text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + text_pair_target: Optional[ + Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] + ] = None, + padding: Union[bool, str, PaddingStrategy] = True, + pad_to_multiple_of: Optional[int] = 2, + src_lang: Optional[str] = None, + tgt_lang: Optional[str] = None, + **kwargs, + ): + """ + Args: + text (`str`, `list[str]`, `list[list[str]]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + text_pair (`str`, `list[str]`, `list[list[str]]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + text_target (`str`, `list[str]`, `list[list[str]]`, *optional*): + The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a + list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), + you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + text_pair_target (`str`, `list[str]`, `list[list[str]]`, *optional*): + The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a + list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), + you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + src_lang (`str`, *optional*): + A string representing the source language. If not specified, the last `src_lang` specified (either + during initialization or when calling this tokenizer) will be used. + tgt_lang (`str`, *optional*): + A string representing the target language. If not specified, the last `tgt_lang` specified (either + during initialization or when calling this tokenizer) will be used. + kwargs (*optional*): + Remaining dictionary of keyword arguments that will be passed to [`PreTrainedTokenizerFast.__call__`]. + """ + if src_lang is not None: + self.src_lang = src_lang + if tgt_lang is not None: + self.tgt_lang = tgt_lang + + output = super().__call__( + text=text, + text_pair=text_pair, + text_target=text_target, + text_pair_target=text_pair_target, + padding=padding, + pad_to_multiple_of=pad_to_multiple_of, + **kwargs, + ) + + return output + + +__all__ = ["SeamlessM4TTokenizerFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..10f256f8575595d6b3409c030e1d6c3b33c75aa9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_seamless_m4t_v2 import * + from .modeling_seamless_m4t_v2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..7ac6c4024a5a6dd3dc5f60472974c771bf27e27e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py @@ -0,0 +1,425 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""SeamlessM4Tv2 model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class SeamlessM4Tv2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`~SeamlessM4Tv2Model`]. It is used to instantiate + an SeamlessM4Tv2 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the SeamlessM4Tv2 + [""](https://huggingface.co/"") architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 256102): + Vocabulary size of the text modality of the SeamlessM4Tv2 model. Defines the number of different tokens + that can be represented by the `inputs_ids` passed when calling [`~SeamlessM4Tv2Model`], + [`~SeamlessM4Tv2ForTextToSpeech`] or [`~SeamlessM4Tv2ForTextToText`]. + t2u_vocab_size (`int`, *optional*, defaults to 10082): + Unit vocabulary size of the SeamlessM4Tv2 model. Defines the number of different "unit tokens" that can be + represented by the `inputs_ids` passed when calling the Text-To-Units sub-model of [`~SeamlessM4Tv2Model`], + [`~SeamlessM4Tv2ForSpeechToSpeech`] or [`~SeamlessM4Tv2ForTextToSpeech`]. + char_vocab_size (`int`, *optional*, defaults to 10943): + Character vocabulary size of the SeamlessM4Tv2 model. Defines the number of different character tokens that + can be represented by the `char_inputs_ids` passed when calling the Text-To-Units sub-model of + [`~SeamlessM4Tv2Model`], [`~SeamlessM4Tv2ForSpeechToSpeech`] or [`~SeamlessM4Tv2ForTextToSpeech`]. + + > Parameters shared across sub-models + + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the "intermediate" layers in the architecture. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model text encoder and decoder might ever be used with. Typically set + this to something large just in case (e.g., 512 or 1024 or 2048). + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether the model is used as an encoder/decoder or not. + encoder_layerdrop (`float`, *optional*, defaults to 0.05): + The LayerDrop probability for the encoders. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.05): + The LayerDrop probability for the decoders. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the decoder and feed-forward layers. If string, + `"gelu"`, `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, decoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all attention layers. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for all activation layers in the model. + scale_embedding (`bool`, *optional*, defaults to `True`): + Scale embeddings by diving by sqrt(d_model). + + > Text encoder and text decoder specific parameters + + encoder_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer text encoder. + encoder_ffn_dim (`int`, *optional*, defaults to 8192): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text encoder. + encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer text encoder. + decoder_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer text decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 8192): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text decoder. + decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer text decoder. + decoder_start_token_id (`int`, *optional*, defaults to 3): + If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token. Only + applied in the text decoder. + max_new_tokens (`int`, *optional*, defaults to 256): + The maximum numbers of text tokens to generate, ignoring the number of tokens in the prompt. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the _padding_ text token. Only applied to the text-decoder model. + bos_token_id (`int`, *optional*, defaults to 2): + The id of the _beginning-of-stream_ text token. Only applied to the text-decoder model. + eos_token_id (`int`, *optional*, defaults to 3): + The id of the _end-of-stream_ text token. Only applied to the text-decoder model. + + > Speech encoder specific parameters + + speech_encoder_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer speech encoder. + speech_encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer speech encoder. + speech_encoder_intermediate_size (`int`, *optional*, defaults to 4096): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer speech encoder. + speech_encoder_hidden_act (`str` or `function`, *optional*, defaults to `"swish"`): + The non-linear activation function (function or string) in the speech encoder. If string, `"gelu"`, + `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported. + speech_encoder_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for all layers in the speech encoder. + add_adapter (`bool`, *optional*, defaults to `True`): + Add an adapter layer on top of the speech encoder. + speech_encoder_layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability for the speech encoder. See the [LayerDrop paper](see + https://huggingface.co/papers/1909.11556) for more details. + feature_projection_input_dim (`int`, *optional*, defaults to 160): + Input dimension of the input feature projection of the speech encoder, i.e the dimension after processing + input audios with [`SeamlessM4TFeatureExtractor`]. + adaptor_kernel_size (`int`, *optional*, defaults to 8): + Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adaptor_stride (`int`, *optional*, defaults to 8): + Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adaptor_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all layers in the speech adapter. + num_adapter_layers (`int`, *optional*, defaults to 1): + Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is + True`. + position_embeddings_type (`str`, *optional*, defaults to `"relative_key"`): + Can be specified to `relative_key`. If left to `None`, no relative position embedding is applied. Only + applied to the speech encoder. For more information on `"relative_key"`, please refer to [Self-Attention + with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). + conv_depthwise_kernel_size (`int`, *optional*, defaults to 31): + Kernel size of convolutional depthwise 1D layer in Conformer blocks. Only applied to the speech encoder. + left_max_position_embeddings (`int`, *optional*, defaults to 64): + The left clipping value for relative positions. + right_max_position_embeddings (`int`, *optional*, defaults to 8): + The right clipping value for relative positions. + speech_encoder_chunk_size (`int`, *optional*, defaults to 20000): The size of each attention chunk. + speech_encoder_left_chunk_num (`int`, *optional*, defaults to 128): + Number of chunks on the left up to which lookahead is allowed. + + > Text-To-Unit (t2u) model specific parameters + + t2u_bos_token_id (`int`, *optional*, defaults to 0): + The id of the _beginning-of-stream_ unit token. Only applied to the text-to-unit seq2seq model. + t2u_pad_token_id (`int`, *optional*, defaults to 1): + The id of the _padding_ unit token. Only applied to the text-to-unit seq2seq model. + t2u_eos_token_id (`int`, *optional*, defaults to 2): + The id of the _end-of-stream_ unit token. Only applied to the text-to-unit seq2seq model. + t2u_encoder_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer text-to-unit encoder. + t2u_encoder_ffn_dim (`int`, *optional*, defaults to 8192): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text-to-unit encoder. + t2u_encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer text-to-unit encoder. + t2u_decoder_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer text-to-unit decoder. + t2u_decoder_ffn_dim (`int`, *optional*, defaults to 8192): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text-to-unit decoder. + t2u_decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer text-to-unit decoder. + t2u_max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model text-to-unit component might ever be used with. Typically set + this to something large just in case (e.g., 512 or 1024 or 2048). + t2u_variance_predictor_embed_dim (`int`, *optional*, defaults to 1024): + The projection dimension of the text-to-unit's duration predictor. + t2u_variance_predictor_hidden_dim (`int`, *optional*, defaults to 256): + Internal dimension of the text-to-unit's duration predictor. + t2u_variance_predictor_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the convolutional layers of the text-to-unit's duration predictor. + t2u_variance_pred_dropout (`float`, *optional*, defaults to 0.5): + The dropout probability of the text-to-unit's duration predictor. + + > Hifi-Gan Vocoder specific parameters + + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the output audio will be generated, expressed in hertz (Hz). + upsample_initial_channel (`int`, *optional*, defaults to 512): + The number of input channels into the hifi-gan upsampling network. Applies to the vocoder only. + upsample_rates (`tuple[int]` or `list[int]`, *optional*, defaults to `[5, 4, 4, 2, 2]`): + A tuple of integers defining the stride of each 1D convolutional layer in the vocoder upsampling network. + The length of *upsample_rates* defines the number of convolutional layers and has to match the length of + *upsample_kernel_sizes*. Applies to the vocoder only. + upsample_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[11, 8, 8, 4, 4]`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the vocoder upsampling + network. The length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match + the length of *upsample_rates*. Applies to the vocoder only. + resblock_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[3, 7, 11]`): + A tuple of integers defining the kernel sizes of the vocoder 1D convolutional layers in the multi-receptive + field fusion (MRF) module. Applies to the vocoder only. + resblock_dilation_sizes (`tuple[tuple[int]]` or `list[list[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`): + A nested tuple of integers defining the dilation rates of the vocoder dilated 1D convolutional layers in + the multi-receptive field fusion (MRF) module. Applies to the vocoder only. + leaky_relu_slope (`float`, *optional*, defaults to 0.1): + The angle of the negative slope used by the leaky ReLU activation in the vocoder. Applies to the vocoder + only. + unit_hifi_gan_vocab_size (`int`, *optional*, defaults to 10000): + Vocabulary size of the SeamlessM4Tv2 vocoder. Defines the number of different unit tokens that can be + represented by the `inputs_ids` passed when calling the vocoder of [`~SeamlessM4Tv2Model`], + [`~SeamlessM4Tv2ForSpeechToSpeech`] or [`~SeamlessM4Tv2ForTextToSpeech`]. + unit_embed_dim (`int`, *optional*, defaults to 1280): + The projection dimension of the input ids given to the hifi-gan vocoder. Applies to the vocoder only. + lang_embed_dim (`int`, *optional*, defaults to 256): + The projection dimension of the target language given to the hifi-gan vocoder. Applies to the vocoder only. + spkr_embed_dim (`int`, *optional*, defaults to 256): + The projection dimension of the speaker id given to the hifi-gan vocoder. Applies to the vocoder only. + vocoder_num_langs (`int`, *optional*, defaults to 36): + Number of langs supported by the vocoder. Might be different from `t2u_num_langs`. + vocoder_num_spkrs (`int`, *optional*, defaults to 200): + Number of speakers supported by the vocoder. + variance_predictor_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the duration predictor. Applies to the vocoder only. + var_pred_dropout (`float`, *optional*, defaults to 0.5): + The dropout probability of the duration predictor. Applies to the vocoder only. + vocoder_offset (`int`, *optional*, defaults to 4): + Offset the unit token ids by this number to account for symbol tokens. Applies to the vocoder only. + + ```python + >>> from transformers import SeamlessM4Tv2Model, SeamlessM4Tv2Config + + >>> # Initializing a SeamlessM4Tv2 "" style configuration + >>> configuration = SeamlessM4Tv2Config() + + >>> # Initializing a model from the "" style configuration + >>> model = SeamlessM4Tv2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "seamless_m4t_v2" + + def __init__( + self, + vocab_size=256102, + t2u_vocab_size=10082, + char_vocab_size=10943, + # shared config + hidden_size=1024, + initializer_range=0.02, + layer_norm_eps=1e-5, + use_cache=True, + max_position_embeddings=4096, + is_encoder_decoder=True, + encoder_layerdrop=0.05, + decoder_layerdrop=0.05, + activation_function="relu", + dropout=0.1, + attention_dropout=0.1, + activation_dropout=0.0, + scale_embedding=True, + # text encoder|decoder + encoder_layers=24, + encoder_ffn_dim=8192, + encoder_attention_heads=16, + decoder_layers=24, + decoder_ffn_dim=8192, + decoder_attention_heads=16, + decoder_start_token_id=3, + max_new_tokens=256, + pad_token_id=0, + bos_token_id=2, + eos_token_id=3, + # speech_encoder + speech_encoder_layers=24, + speech_encoder_attention_heads=16, + speech_encoder_intermediate_size=4096, + speech_encoder_hidden_act="swish", + speech_encoder_dropout=0.0, + add_adapter=True, + speech_encoder_layerdrop=0.1, + feature_projection_input_dim=160, + adaptor_kernel_size=8, + adaptor_stride=8, + adaptor_dropout=0.1, + num_adapter_layers=1, + position_embeddings_type="relative_key", + conv_depthwise_kernel_size=31, + left_max_position_embeddings=64, + right_max_position_embeddings=8, + speech_encoder_chunk_size=20000, + speech_encoder_left_chunk_num=128, + # t2u config + t2u_bos_token_id=0, + t2u_pad_token_id=1, + t2u_eos_token_id=2, + t2u_encoder_layers=6, + t2u_encoder_ffn_dim=8192, + t2u_encoder_attention_heads=16, + t2u_decoder_layers=6, + t2u_decoder_ffn_dim=8192, + t2u_decoder_attention_heads=16, + t2u_max_position_embeddings=4096, + t2u_variance_predictor_embed_dim=1024, + t2u_variance_predictor_hidden_dim=256, + t2u_variance_predictor_kernel_size=3, + t2u_variance_pred_dropout=0.5, + # hifi-gan vocoder config + sampling_rate=16000, + upsample_initial_channel=512, + upsample_rates=[5, 4, 4, 2, 2], + upsample_kernel_sizes=[11, 8, 8, 4, 4], + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + leaky_relu_slope=0.1, + # specific to Code Hifi-Gan + unit_hifi_gan_vocab_size=10000, + unit_embed_dim=1280, + lang_embed_dim=256, + spkr_embed_dim=256, + vocoder_num_langs=36, + vocoder_num_spkrs=200, + variance_predictor_kernel_size=3, + var_pred_dropout=0.5, + vocoder_offset=4, + **kwargs, + ): + # overall_config + self.vocab_size = vocab_size + self.t2u_vocab_size = t2u_vocab_size + self.char_vocab_size = char_vocab_size + self.hidden_size = hidden_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.max_position_embeddings = max_position_embeddings + self.use_cache = use_cache + self.max_new_tokens = max_new_tokens + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.activation_function = activation_function + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.scale_embedding = scale_embedding + # for proper config init + self.num_attention_heads = decoder_attention_heads + self.num_hidden_layers = decoder_layers + + # text|unit encoder|decoder + self.encoder_layers = encoder_layers + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_attention_heads = encoder_attention_heads + self.decoder_layers = decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_attention_heads = decoder_attention_heads + + # speech_encoder + self.speech_encoder_layers = speech_encoder_layers + self.speech_encoder_hidden_act = speech_encoder_hidden_act + self.speech_encoder_dropout = speech_encoder_dropout + self.speech_encoder_attention_heads = speech_encoder_attention_heads + self.speech_encoder_layerdrop = speech_encoder_layerdrop + self.speech_encoder_intermediate_size = speech_encoder_intermediate_size + self.feature_projection_input_dim = feature_projection_input_dim + self.adaptor_kernel_size = adaptor_kernel_size + self.adaptor_stride = adaptor_stride + self.adaptor_dropout = adaptor_dropout + self.num_adapter_layers = num_adapter_layers + self.position_embeddings_type = position_embeddings_type + self.conv_depthwise_kernel_size = conv_depthwise_kernel_size + self.add_adapter = add_adapter + self.left_max_position_embeddings = left_max_position_embeddings + self.right_max_position_embeddings = right_max_position_embeddings + self.speech_encoder_chunk_size = speech_encoder_chunk_size + self.speech_encoder_left_chunk_num = speech_encoder_left_chunk_num + + # t2u config + self.t2u_bos_token_id = t2u_bos_token_id + self.t2u_pad_token_id = t2u_pad_token_id + self.t2u_eos_token_id = t2u_eos_token_id + self.t2u_encoder_layers = t2u_encoder_layers + self.t2u_encoder_ffn_dim = t2u_encoder_ffn_dim + self.t2u_encoder_attention_heads = t2u_encoder_attention_heads + self.t2u_decoder_layers = t2u_decoder_layers + self.t2u_decoder_ffn_dim = t2u_decoder_ffn_dim + self.t2u_decoder_attention_heads = t2u_decoder_attention_heads + self.t2u_max_position_embeddings = t2u_max_position_embeddings + self.t2u_variance_predictor_embed_dim = t2u_variance_predictor_embed_dim # TODO: add to docstrings + self.t2u_variance_predictor_hidden_dim = t2u_variance_predictor_hidden_dim # TODO: add to docstrings + self.t2u_variance_predictor_kernel_size = t2u_variance_predictor_kernel_size # TODO: add to docstrings + self.t2u_variance_pred_dropout = t2u_variance_pred_dropout # TODO: add to docstrings + + # hifi-gan vocoder config + # original parameters specific to Hifi-Gan + self.sampling_rate = sampling_rate + self.upsample_initial_channel = upsample_initial_channel + self.upsample_rates = upsample_rates + self.upsample_kernel_sizes = upsample_kernel_sizes + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.leaky_relu_slope = leaky_relu_slope + + # specific to Code Hifi-Gan + self.unit_hifi_gan_vocab_size = unit_hifi_gan_vocab_size + self.unit_embed_dim = unit_embed_dim + self.lang_embed_dim = lang_embed_dim + self.spkr_embed_dim = spkr_embed_dim + self.vocoder_num_langs = vocoder_num_langs + self.vocoder_num_spkrs = vocoder_num_spkrs + self.variance_predictor_kernel_size = variance_predictor_kernel_size + self.var_pred_dropout = var_pred_dropout + self.vocoder_offset = vocoder_offset + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + is_encoder_decoder=is_encoder_decoder, + max_position_embeddings=max_position_embeddings, + **kwargs, + ) + + +__all__ = ["SeamlessM4Tv2Config"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..bf883374b68f23e195327b153d2a3855b43820b1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -0,0 +1,4398 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch SeamlessM4Tv2 model.""" + +import copy +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import Tensor, nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Wav2Vec2BaseModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, auto_docstring, logging +from ...utils.deprecation import deprecate_kwarg +from .configuration_seamless_m4t_v2 import SeamlessM4Tv2Config + + +logger = logging.get_logger(__name__) + +SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS = r""" + input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`): + Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the + [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details. + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more + information on the default strategy. + inputs_embeds (`torch.FloatTensor` of shape`(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. +""" + + +@dataclass +@auto_docstring( + custom_intro=""" + Class defining the generated outputs from [`SeamlessM4Tv2Model`], [`SeamlessM4Tv2ForTextToText`], + [`SeamlessM4Tv2ForTextToSpeech`], [`SeamlessM4Tv2ForSpeechToSpeech`] and [`SeamlessM4Tv2ForTextToSpeech`]. + """ +) +# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TGenerationOutput with SeamlessM4T->SeamlessM4Tv2 +class SeamlessM4Tv2GenerationOutput(ModelOutput): + r""" + waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + The final audio waveform predicted by the model. + waveform_lengths (`torch.IntTensor` of shape `(batch_size,)`, *optional*): + The length in samples of each element in the `waveform` batch. + sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + The generated translated sequences. This is the output of the text-to-text or the speech-to-text models. + The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished + early due to the `eos_token_id`. + unit_sequences (`torch.LongTensor` of shape `(batch_size, unit_sequence_length)`, *optional*): + The generated translated unit sequences. This is the output of the text-to-units model. The second + dimension (unit_sequence_length) is either equal to `t2u_max_length` or shorter if all batches finished + early due to the `t2u_eos_token_id`. + """ + + waveform: Optional[torch.FloatTensor] = None + waveform_lengths: Optional[torch.IntTensor] = None + sequences: Optional[tuple[torch.FloatTensor]] = None + unit_sequences: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Class defining the outputs from [`SeamlessM4Tv2TextToUnitDecoder`]. + """ +) +class SeamlessM4Tv2TextToUnitDecoderOutput(ModelOutput): + r""" + padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0 + for *masked* + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + padding_mask: Optional[torch.Tensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Class defining the outputs from [`SeamlessM4Tv2TextToUnitForConditionalGeneration`] and + [`SeamlessM4Tv2TextToUnitModel`]. + """ +) +class SeamlessM4Tv2TextToUnitOutput(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0 + for *masked* + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + padding_mask: Optional[torch.Tensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + loss: Optional[torch.FloatTensor] = None + + +############ UTILS ################ + + +# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +# Copied from transformers.models.bart.modeling_bart.shift_tokens_right +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor): + """ + Computes an attention mask of the form `(batch, seq_len)` with an attention for each element in the batch that + stops at the corresponding element in `seq_lens`. + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`): + The sequences to mask, where `*` is any number of sequence-specific dimensions including none. + seq_lens (`torch.Tensor` of shape `(batch)`: + Each element represents the length of the sequence at the same index in `hidden_states` + + Returns: + `torch.FloatTensor`: The float attention mask of shape `(batch, seq_len)` + """ + batch_size, mask_seq_len = hidden_states.shape[:2] + + indices = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1) + + bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len) + + mask = hidden_states.new_ones((batch_size, mask_seq_len)) + + mask = mask.masked_fill(bool_mask, 0) + + return mask + + +# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.format_speech_generation_kwargs with SeamlessM4T->SeamlessM4Tv2 +def format_speech_generation_kwargs(kwargs): + """ + Format kwargs for SeamlessM4Tv2 models that generate speech, attribute kwargs to either the text generation or the + speech generation models. + + Args: + kwargs (`dict`)`: + Keyword arguments are of two types: + + - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model, + except for `decoder_input_ids` which will only be passed through the text components. + - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the + text model and speech model respectively. It has the priority over the keywords without a prefix. + + This means you can, for example, specify a generation strategy for one generation but not for the + other. + """ + # attribute kwargs to models + kwargs_text = {} + kwargs_speech = {} + for key, value in kwargs.items(): + if key.startswith("text_"): + key = key[len("text_") :] + kwargs_text[key] = value + elif key.startswith("speech_"): + key = key[len("speech_") :] + kwargs_speech[key] = value + elif key == "generation_config": + kwargs_text[key] = value + else: + # If the key is already in a specific config, then it's been set with a + # submodules specific value and we don't override + if key not in kwargs_text: + kwargs_text[key] = value + if key not in kwargs_speech: + kwargs_speech[key] = value + return kwargs_text, kwargs_speech + + +############ SPEECH ENCODER related code ################ + + +class SeamlessM4Tv2ConformerFeatureProjection(nn.Module): + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerFeatureProjection.__init__ + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.feature_projection_input_dim, eps=config.layer_norm_eps) + self.projection = nn.Linear(config.feature_projection_input_dim, config.hidden_size) + self.dropout = nn.Dropout(config.speech_encoder_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states.to(self.layer_norm.weight.dtype)) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerFeedForward with SeamlessM4T->SeamlessM4Tv2 +class SeamlessM4Tv2ConformerFeedForward(nn.Module): + def __init__(self, config, act_fn=None, dropout=None): + super().__init__() + dropout = dropout if dropout is not None else config.speech_encoder_dropout + act_fn = act_fn if act_fn is not None else config.speech_encoder_hidden_act + + self.intermediate_dropout = nn.Dropout(dropout) + self.intermediate_dense = nn.Linear(config.hidden_size, config.speech_encoder_intermediate_size) + self.intermediate_act_fn = ACT2FN[act_fn] if isinstance(act_fn, str) else act_fn + + self.output_dense = nn.Linear(config.speech_encoder_intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class SeamlessM4Tv2ConformerConvolutionModule(nn.Module): + """Convolution block used in the conformer block. Uses a causal depthwise convolution similar to that + described in Section 2.1 of https://huggingface.co/papers/1609.03499 + """ + + def __init__(self, config): + super().__init__() + if (config.conv_depthwise_kernel_size - 1) % 2 == 1: + raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding") + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.pointwise_conv1 = nn.Conv1d( + config.hidden_size, + 2 * config.hidden_size, + kernel_size=1, + stride=1, + padding=0, + bias=False, + ) + self.glu = nn.GLU(dim=1) + self.depthwise_conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + config.conv_depthwise_kernel_size, + stride=1, + padding=0, + groups=config.hidden_size, + bias=False, + ) + self.depthwise_layer_norm = nn.LayerNorm(config.hidden_size) + self.activation = ACT2FN[config.speech_encoder_hidden_act] + self.pointwise_conv2 = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=1, + stride=1, + padding=0, + bias=False, + ) + self.dropout = nn.Dropout(config.speech_encoder_dropout) + + def forward(self, hidden_states, attention_mask=None): + hidden_states = self.layer_norm(hidden_states) + + # Ensure that we do not leak padded positions in depthwise convolution. + # Put 0 where necessary + if attention_mask is not None: + hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0) + + # exchange the temporal dimension and the feature dimension + hidden_states = hidden_states.transpose(1, 2) + + # GLU mechanism + # => (batch, 2*channel, dim) + hidden_states = self.pointwise_conv1(hidden_states) + # => (batch, channel, dim) + hidden_states = self.glu(hidden_states) + + # Pad the sequence entirely on the left because of causal convolution. + hidden_states = torch.nn.functional.pad(hidden_states, (self.depthwise_conv.kernel_size[0] - 1, 0)) + + # 1D Depthwise Conv + hidden_states = self.depthwise_conv(hidden_states) + hidden_states = self.depthwise_layer_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + hidden_states = self.activation(hidden_states) + + hidden_states = self.pointwise_conv2(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class SeamlessM4Tv2ConformerSelfAttention(nn.Module): + """Construct a SeamlessM4Tv2ConformerSelfAttention object. + Can be enhanced with relative position embeddings. + """ + + def __init__(self, config, use_position_embeddings=True): + super().__init__() + + self.head_size = config.hidden_size // config.speech_encoder_attention_heads + self.num_heads = config.speech_encoder_attention_heads + self.position_embeddings_type = config.position_embeddings_type if use_position_embeddings else None + + self.linear_q = nn.Linear(config.hidden_size, config.hidden_size) + self.linear_k = nn.Linear(config.hidden_size, config.hidden_size) + self.linear_v = nn.Linear(config.hidden_size, config.hidden_size) + self.linear_out = nn.Linear(config.hidden_size, config.hidden_size) + + self.dropout = nn.Dropout(p=config.speech_encoder_dropout) + + if self.position_embeddings_type == "relative_key": + self.left_max_position_embeddings = config.left_max_position_embeddings + self.right_max_position_embeddings = config.right_max_position_embeddings + num_positions = self.left_max_position_embeddings + self.right_max_position_embeddings + 1 + self.distance_embedding = nn.Embedding(num_positions, self.head_size) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + # self-attention mechanism + batch_size, sequence_length, hidden_size = hidden_states.size() + + # make sure query/key states can be != value states + query_key_states = hidden_states + value_states = hidden_states + + # project query_key_states and value_states + query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size) + key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size) + value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size) + + # => (batch, head, time1, d_k) + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + + attn_weights = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size) + + if self.position_embeddings_type == "relative_key": + query_length, key_length = query.shape[2], key.shape[2] + + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_r - position_ids_l + distance = torch.clamp(distance, -self.left_max_position_embeddings, self.right_max_position_embeddings) + + positional_embedding = self.distance_embedding(distance + self.left_max_position_embeddings) + positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility + + relative_position_attn_weights = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) + attn_weights = attn_weights + (relative_position_attn_weights / math.sqrt(self.head_size)) + + # apply attention_mask if necessary + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + # => (batch, head, time1, time2) + attn_weights = torch.softmax(attn_weights, dim=-1) + attn_weights = self.dropout(attn_weights) + + # => (batch, head, time1, d_k) + attn_output = torch.matmul(attn_weights, value) + + # => (batch, time1, hidden_size) + attn_output = attn_output.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size) + attn_output = self.linear_out(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights + + +class SeamlessM4Tv2ConformerEncoderLayer(GradientCheckpointingLayer): + """Conformer block based on https://huggingface.co/papers/2005.08100.""" + + # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerEncoderLayer.__init__ with Wav2Vec2->SeamlessM4Tv2, attention_dropout->speech_encoder_dropout, torch.nn->nn + def __init__(self, config): + super().__init__() + embed_dim = config.hidden_size + dropout = config.speech_encoder_dropout + + # Feed-forward 1 + self.ffn1_layer_norm = nn.LayerNorm(embed_dim) + self.ffn1 = SeamlessM4Tv2ConformerFeedForward(config) + + # Self-Attention + self.self_attn_layer_norm = nn.LayerNorm(embed_dim) + self.self_attn_dropout = nn.Dropout(dropout) + self.self_attn = SeamlessM4Tv2ConformerSelfAttention(config) + + # Conformer Convolution + self.conv_module = SeamlessM4Tv2ConformerConvolutionModule(config) + + # Feed-forward 2 + self.ffn2_layer_norm = nn.LayerNorm(embed_dim) + self.ffn2 = SeamlessM4Tv2ConformerFeedForward(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + def forward( + self, + hidden_states, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + conv_attention_mask: Optional[torch.Tensor] = None, + ): + # 1. Feed-Forward 1 layer + residual = hidden_states + hidden_states = self.ffn1_layer_norm(hidden_states) + hidden_states = self.ffn1(hidden_states) + hidden_states = hidden_states * 0.5 + residual + residual = hidden_states + + # 2. Self-Attention layer + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = self.self_attn_dropout(hidden_states) + hidden_states = hidden_states + residual + + # 3. Convolutional Layer + residual = hidden_states + hidden_states = self.conv_module(hidden_states, attention_mask=conv_attention_mask) + hidden_states = residual + hidden_states + + # 4. Feed-Forward 2 Layer + residual = hidden_states + hidden_states = self.ffn2_layer_norm(hidden_states) + hidden_states = self.ffn2(hidden_states) + hidden_states = hidden_states * 0.5 + residual + hidden_states = self.final_layer_norm(hidden_states) + + return hidden_states, attn_weights + + +class SeamlessM4Tv2ConformerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + self.dropout = nn.Dropout(config.speech_encoder_dropout) + self.layers = nn.ModuleList( + [SeamlessM4Tv2ConformerEncoderLayer(config) for _ in range(config.speech_encoder_layers)] + ) + + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.gradient_checkpointing = False + + def _apply_chunk_attention(self, attention_mask, hidden_states): + """ + Creates a chunk attention mask. It creates a mask to prevent attention across chunks, ensuring that each + position attends only to positions within its own chunk. If a left chunk overlap is specified + (`speech_encoder_chunk_size` in the configuration), the attention mask is adjusted accordingly to allow each + position to also attends the `speech_encoder_chunk_size - 1` previous chunks. + """ + sequence_len = hidden_states.shape[1] + + chunk_indices = torch.arange(sequence_len, device=hidden_states.device) + chunk_indices = torch.div(chunk_indices, self.config.speech_encoder_chunk_size).long() + + start_indices = torch.full_like(chunk_indices, 0) + if self.config.speech_encoder_left_chunk_num >= 0: + start_indices = (chunk_indices - self.config.speech_encoder_left_chunk_num).clamp_(min=0) + start_indices = start_indices * self.config.speech_encoder_chunk_size + start_indices = start_indices.unsqueeze(1).expand(-1, sequence_len) + + end_indices = ((chunk_indices + 1) * self.config.speech_encoder_chunk_size).clamp_(max=sequence_len) + + end_indices = end_indices.unsqueeze(1).expand(-1, sequence_len) + + indices = torch.arange(sequence_len, device=hidden_states.device).unsqueeze(0).expand(sequence_len, -1) + + chunk_mask = (indices < start_indices) | (indices >= end_indices) + chunk_mask = chunk_mask.unsqueeze(0).unsqueeze(0) + + attention_mask = chunk_mask if attention_mask is None else (attention_mask.bool() | chunk_mask) + attention_mask = attention_mask.to(dtype=hidden_states.dtype) + return attention_mask + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + conv_attention_mask = attention_mask + if attention_mask is not None: + # make sure padded tokens output 0 + hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0) + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + if self.config.speech_encoder_chunk_size is not None: + attention_mask = self._apply_chunk_attention(attention_mask, hidden_states) + + if attention_mask is not None: + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + + hidden_states = self.dropout(hidden_states) + + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = self.training and dropout_probability < self.config.speech_encoder_layerdrop + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync + layer_outputs = layer( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + conv_attention_mask=conv_attention_mask, + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerAdapterLayer with SeamlessM4T->SeamlessM4Tv2 +class SeamlessM4Tv2ConformerAdapterLayer(nn.Module): + def __init__(self, config): + super().__init__() + embed_dim = config.hidden_size + dropout = config.adaptor_dropout + + self.kernel_size = config.adaptor_kernel_size + self.stride = config.adaptor_stride + + # 1. residual convolution + self.residual_layer_norm = nn.LayerNorm(embed_dim) + self.residual_conv = nn.Conv1d( + embed_dim, + 2 * embed_dim, + self.kernel_size, + stride=self.stride, + padding=self.stride // 2, + ) + self.activation = nn.GLU(dim=1) + + # Self-Attention + self.self_attn_layer_norm = nn.LayerNorm(embed_dim) + self.self_attn_conv = nn.Conv1d( + embed_dim, + 2 * embed_dim, + self.kernel_size, + stride=self.stride, + padding=self.stride // 2, + ) + self.self_attn = SeamlessM4Tv2ConformerSelfAttention(config, use_position_embeddings=False) + self.self_attn_dropout = nn.Dropout(dropout) + + # Feed-forward + self.ffn_layer_norm = nn.LayerNorm(embed_dim) + self.ffn = SeamlessM4Tv2ConformerFeedForward(config, act_fn="relu", dropout=dropout) + + def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask): + pad = self.kernel_size // 2 + seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1) + + seq_lens = ((seq_lens + 2 * pad - self.kernel_size) / self.stride) + 1 + + return seq_lens.floor() + + def forward( + self, + hidden_states, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + residual = self.residual_layer_norm(hidden_states) + + # Apply pooling to the residual to match the sequence length of the + # multi-head attention output. + # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len) + residual = residual.transpose(1, 2) + residual = self.residual_conv(residual) + residual = self.activation(residual) + # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim) + residual = residual.transpose(1, 2) + + hidden_states = self.self_attn_layer_norm(hidden_states) + # Apply pooling before feeding to the multihead-attention layer. + # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.self_attn_conv(hidden_states) + hidden_states = self.activation(hidden_states) + # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim) + hidden_states = hidden_states.transpose(1, 2) + + if attention_mask is not None: + sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to( + hidden_states.device + ) + attention_mask = _compute_new_attention_mask(hidden_states=hidden_states, seq_lens=sub_sampled_lengths) + attention_mask = _prepare_4d_attention_mask( + attention_mask, + hidden_states.dtype, + ) + + # The rest of the computation is identical to a vanilla Transformer + # encoder layer. + hidden_states, attn_weights = self.self_attn( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = self.self_attn_dropout(hidden_states) + hidden_states = hidden_states + residual + + residual = hidden_states + + hidden_states = self.ffn_layer_norm(hidden_states) + hidden_states = self.ffn(hidden_states) + residual + + return hidden_states + + +# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerAdapter with SeamlessM4T->SeamlessM4Tv2 +class SeamlessM4Tv2ConformerAdapter(nn.Module): + def __init__(self, config): + super().__init__() + + self.layers = nn.ModuleList( + SeamlessM4Tv2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers) + ) + + def forward(self, hidden_states, attention_mask): + # down project hidden_states if necessary + + for layer in self.layers: + hidden_states = layer(hidden_states, attention_mask) + + return hidden_states + + +############ TEXT / UNITS related code ################ + + +# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100ScaledWordEmbedding with M2M100->SeamlessM4Tv2 +class SeamlessM4Tv2ScaledWordEmbedding(nn.Embedding): + """ + This module overrides nn.Embeddings' forward by multiplying with embeddings scale. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0): + super().__init__(num_embeddings, embedding_dim, padding_idx) + self.embed_scale = embed_scale + + def forward(self, input_ids: torch.Tensor): + return super().forward(input_ids) * self.embed_scale + + +# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding +class SeamlessM4Tv2SinusoidalPositionalEmbedding(nn.Module): + """This module produces sinusoidal positional embeddings of any length.""" + + def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None): + super().__init__() + self.offset = 2 + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.make_weights(num_positions + self.offset, embedding_dim, padding_idx) + + def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): + emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx) + if hasattr(self, "weights"): + # in forward put the weights on the correct dtype and device of the param + emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device) + + self.register_buffer("weights", emb_weights, persistent=False) + + @staticmethod + def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): + """ + Build sinusoidal embeddings. + + This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of + "Attention Is All You Need". + """ + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb) + emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0) + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1) + if embedding_dim % 2 == 1: + # zero pad + emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) + if padding_idx is not None: + emb[padding_idx, :] = 0 + + return emb.to(torch.get_default_dtype()) + + @torch.no_grad() + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values_length: int = 0, + ): + if input_ids is not None: + bsz, seq_len = input_ids.size() + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to( + input_ids.device + ) + else: + bsz, seq_len = inputs_embeds.size()[:-1] + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length) + + # expand embeddings if needed + max_pos = self.padding_idx + 1 + seq_len + past_key_values_length + if max_pos > self.weights.size(0): + self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx) + + return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach() + + def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length + + +class SeamlessM4Tv2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + # Copied from transformers.models.bart.modeling_bart.BartAttention.__init__ with Bart->SeamlessM4Tv2 + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[SeamlessM4Tv2Config] = None, + layer_idx: Optional[int] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + is_cross_attention = encoder_hidden_states is not None + batch_size, seq_length = hidden_states.shape[:2] + + is_updated = False + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.layers[self.layer_idx].keys + value_states = curr_past_key_value.layers[self.layer_idx].values + else: + key_states = self.k_proj(current_states) + value_states = self.v_proj(current_states) + key_states = key_states.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2) + + if past_key_values is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + query_states = self.q_proj(hidden_states) + query_states = query_states.reshape(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + query_states = query_states * self.scaling + attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) + + if attention_mask is not None: + attention_scores = attention_scores + attention_mask + + # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).type_as(attention_scores) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + # attn_output = torch.bmm(attn_probs, value_states) ? + context_states = torch.matmul(attn_weights, value_states) + # attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) ? + context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1) + attn_output = self.out_proj(context_states) + + return attn_output, attn_weights + + +# Copied from transformers.models.nllb_moe.modeling_nllb_moe.NllbMoeDenseActDense with NllbMoe->SeamlessM4Tv2,DenseActDense->FeedForwardNetwork, d_model->hidden_size +class SeamlessM4Tv2FeedForwardNetwork(nn.Module): + def __init__(self, config: SeamlessM4Tv2Config, ffn_dim: int): + super().__init__() + self.fc1 = nn.Linear(config.hidden_size, ffn_dim) + self.fc2 = nn.Linear(ffn_dim, config.hidden_size) + self.dropout = nn.Dropout(config.activation_dropout) + self.act = ACT2FN[config.activation_function] + + def forward(self, hidden_states): + hidden_states = self.fc1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dropout(hidden_states) + if ( + isinstance(self.fc2.weight, torch.Tensor) + and hidden_states.dtype != self.fc2.weight.dtype + and (self.fc2.weight.dtype != torch.int8 and self.fc2.weight.dtype != torch.uint8) + ): + hidden_states = hidden_states.to(self.fc2.weight.dtype) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TEncoderLayer with SeamlessM4T->SeamlessM4Tv2 +class SeamlessM4Tv2EncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: SeamlessM4Tv2Config, encoder_ffn_dim=None, encoder_attention_heads=None): + super().__init__() + encoder_ffn_dim = config.encoder_ffn_dim if encoder_ffn_dim is None else encoder_ffn_dim + encoder_attention_heads = ( + config.encoder_attention_heads if encoder_attention_heads is None else encoder_attention_heads + ) + + self.embed_dim = config.hidden_size + self.self_attn = SeamlessM4Tv2Attention( + embed_dim=self.embed_dim, + num_heads=encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.attn_dropout = nn.Dropout(config.dropout) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + self.ffn = SeamlessM4Tv2FeedForwardNetwork(config, ffn_dim=encoder_ffn_dim) + + self.ffn_layer_norm = nn.LayerNorm(config.hidden_size) + self.ffn_dropout = nn.Dropout(config.activation_dropout) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: bool = False, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): + attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very + large negative values. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = self.attn_dropout(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + + hidden_states = self.ffn_layer_norm(hidden_states) + + hidden_states = self.ffn(hidden_states) + hidden_states = self.ffn_dropout(hidden_states) + + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TDecoderLayer with SeamlessM4T->SeamlessM4Tv2 +class SeamlessM4Tv2DecoderLayer(GradientCheckpointingLayer): + def __init__( + self, config: SeamlessM4Tv2Config, decoder_ffn_dim=None, decoder_attention_heads=None, layer_idx=None + ): + super().__init__() + decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim + decoder_attention_heads = ( + config.decoder_attention_heads if decoder_attention_heads is None else decoder_attention_heads + ) + + self.embed_dim = config.hidden_size + self.self_attn = SeamlessM4Tv2Attention( + embed_dim=self.embed_dim, + num_heads=decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + layer_idx=layer_idx, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.attn_dropout = nn.Dropout(config.dropout) + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.cross_attention = SeamlessM4Tv2Attention( + self.embed_dim, + decoder_attention_heads, + config.attention_dropout, + is_decoder=True, + layer_idx=layer_idx, + ) + self.cross_attention_layer_norm = nn.LayerNorm(self.embed_dim) + + self.ffn = SeamlessM4Tv2FeedForwardNetwork(config, ffn_dim=decoder_ffn_dim) + + self.ffn_layer_norm = nn.LayerNorm(config.hidden_size) + self.ffn_dropout = nn.Dropout(config.activation_dropout) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + cache_position: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): + attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very + large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): + encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by + very large negative values. + past_key_values (`Cache`): + cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + past_key_values=past_key_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = self.attn_dropout(hidden_states) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.cross_attention_layer_norm(hidden_states) + + hidden_states, cross_attn_weights = self.cross_attention( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = self.attn_dropout(hidden_states) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + + hidden_states = self.ffn_layer_norm(hidden_states) + + hidden_states = self.ffn(hidden_states) + hidden_states = self.ffn_dropout(hidden_states) + + hidden_states = residual + hidden_states + + return hidden_states, self_attn_weights, cross_attn_weights + + +class SeamlessM4Tv2TextToUnitDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: SeamlessM4Tv2Config, decoder_ffn_dim=None, decoder_attention_heads=None): + super().__init__() + decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim + decoder_attention_heads = ( + config.decoder_attention_heads if decoder_attention_heads is None else decoder_attention_heads + ) + self.dropout = config.dropout + self.embed_dim = config.hidden_size + + self.self_attn = SeamlessM4Tv2Attention( + embed_dim=self.embed_dim, + num_heads=decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + self.conv1 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=7, stride=1, padding="same") + self.activation_fn = ACT2FN[config.activation_function] + self.conv2 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=7, stride=1, padding="same") + + self.conv_layer_norm = nn.LayerNorm(config.hidden_size) + self.conv_dropout = nn.Dropout(self.dropout) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + padding_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): + attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very + large negative values. + padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* + or 0 for *masked* + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Conv + residual = hidden_states + + # Apply padding mask to avoid leaking padded positions in the convolution layer + if padding_mask is not None: + hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0) + hidden_states = self.conv1(hidden_states.transpose(1, 2)).transpose(1, 2) + + if padding_mask is not None: + hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0) + + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.conv2(hidden_states.transpose(1, 2)).transpose(1, 2) + + hidden_states = self.conv_dropout(hidden_states) + hidden_states = residual + hidden_states + hidden_states = self.conv_layer_norm(hidden_states) + + return hidden_states, self_attn_weights + + +############ SUB-MODELS related code ################ + + +@auto_docstring +class SeamlessM4Tv2PreTrainedModel(PreTrainedModel): + config: SeamlessM4Tv2Config + base_model_prefix = "seamless_m4t_v2" + supports_gradient_checkpointing = True + _no_split_modules = [ + "SeamlessM4Tv2EncoderLayer", + "SeamlessM4Tv2DecoderLayer", + "SeamlessM4Tv2ConformerEncoderLayer", + "SeamlessM4Tv2TextToUnitDecoderLayer", + ] + + def _init_weights(self, module: nn.Module): + """Initialize the weights""" + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, SeamlessM4Tv2ConformerSelfAttention): + if hasattr(module, "pos_bias_u"): + nn.init.xavier_uniform_(module.pos_bias_u) + if hasattr(module, "pos_bias_v"): + nn.init.xavier_uniform_(module.pos_bias_v) + elif isinstance(module, SeamlessM4Tv2ConformerFeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, SeamlessM4Tv2TextToUnitDecoder): + module.pos_emb_alpha_char.data.fill_(1) + module.pos_emb_alpha.data.fill_(1) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, (nn.Conv1d, nn.ConvTranspose1d)): + nn.init.kaiming_normal_(module.weight) + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TPreTrainedModel._compute_sub_sample_lengths_from_attention_mask + def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask): + kernel_size, stride = self.config.adaptor_kernel_size, self.config.adaptor_stride + pad = kernel_size // 2 + seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1) + + seq_lens = ((seq_lens + 2 * pad - kernel_size) / stride) + 1 + + return seq_lens.floor() + + def _indices_to_subwords(self, input_ids): + """ + Returns the corresponding text string for each input id. + """ + if not hasattr(self.generation_config, "id_to_text"): + raise ValueError( + """This model generation config doesn't have a `id_to_text` key which maps + token ids to subwords. Make sure to load the right generation config.""" + ) + batch_size, sequence_len = input_ids.shape + + subwords_batch = [] + for batch_id in range(batch_size): + subwords = [] + for i in range(sequence_len): + subword = self.generation_config.id_to_text.get(str(input_ids[batch_id, i].item())) + subwords.append(str(subword)) + subwords_batch.append(subwords) + return subwords_batch + + def _count_character_length_in_subword( + self, + input_ids, + subwords_batch, + merge_space_with_prev_subword=False, + pad_token_id=0, + unk_token_id=1, + space="▁", + ): + """ + Counts the number of characters per text string associated with the input token id. + + Args: + input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + subwords_batch (`list[list[str]]` of shape `(batch_size, sequence_length)`): + Corresponding text string for each input id. + merge_space_with_prev_subword (`bool`, *optional*, defaults to `False`): + Indicates if the space character is merged with the previous subword. If `False`, it will be merged + with the next subword. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the _padding_ text token. If it is encountered when calculating the length of a subword + sample, the lengths of subsequent subwords will be set to 0. + unk_token_id (`int`, *optional*, defaults to 1): + The id of the _unknown_ text token. Associated to a subword of length 1. + space (`str`, *optional*, defaults to `"▁"`): + The space character. + """ + batch_size, _ = input_ids.shape + + char_count_per_id = input_ids.new_zeros(input_ids.size()) + + subword_lens = input_ids.ne(pad_token_id).sum(1) + + for batch_id in range(batch_size): + # We slice out the tensor till the padding index. + subword_indices = input_ids[batch_id, : subword_lens[batch_id]] + subwords = subwords_batch[batch_id][: subword_lens[batch_id]] + + is_next_start_with_space = [ + len(subwords[i + 1]) > 1 and subwords[i + 1][0] == space if i < len(subwords) - 1 else False + for i in range(len(subwords)) + ] + is_punc = [ + len(subwords[i]) == 1 + and not subwords[i].isalpha() + and not subwords[i].isnumeric() + and subwords[i] != space + for i in range(len(subwords)) + ] + for i, (subword_idx, subword) in enumerate(zip(subword_indices, subwords)): + if subword_idx == pad_token_id: + break + + if subword_idx == unk_token_id: + # We set char_len to 1 for an unk token. + char_len = 1 + + if merge_space_with_prev_subword and is_next_start_with_space[i]: + char_len += 1 + else: + # By default, spaces are merged with the next subword. + # char_len includes the space. + char_len = len(subword) + + if merge_space_with_prev_subword: + # Add the space for the next subword. + if is_next_start_with_space[i]: + char_len += 1 + # Subtract the space for the current subword. + if i > 0 and is_next_start_with_space[i - 1]: + char_len -= 1 + else: + # Merge space with punctuation mark by default. + if is_punc[i] and is_next_start_with_space[i]: + char_len += 1 + # Subtract the space for the subword succeeding the punctuation mark. + elif i > 0 and is_punc[i - 1] and is_next_start_with_space[i - 1]: + char_len -= 1 + + char_count_per_id[batch_id, i] = char_len + + return char_count_per_id + + def _get_char_input_ids(self, input_ids, subwords_batch, char_count_per_id, pad_token_id=0, unk_token_id=1): + """ + Returns the corresponding character input id for each character of `subwords_batch`. + + Args: + input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + subwords_batch (`list[list[str]]` of shape `(batch_size, sequence_length)`): + Corresponding text string for each input id. + char_count_per_id (`torch.Tensor` of shape `(batch_size, sequence_length)`): + Number of characters per input id. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the _padding_ text token. If it is encountered when calculating the length of a subword + sample, the lengths of subsequent subwords will be set to 0. + unk_token_id (`int`, *optional*, defaults to 1): + The id of the _unknown_ text token. Associated to a subword of length 1. + Returns: + `torch.Tensor`: Tensor of shape `(batch_size, char_sequence_length)` containing the id of each character. + """ + if not hasattr(self.generation_config, "char_to_id"): + raise ValueError( + """This model generation config doesn't have a `char_to_id` key which maps + characters to character ids. Make sure to load the right generation config.""" + ) + + batch_size = input_ids.shape[0] + max_len = int(char_count_per_id.sum(1).max().item()) + + char_seqs = input_ids.new_zeros((batch_size, max_len)).fill_(pad_token_id) + + subword_lens = input_ids.ne(pad_token_id).sum(1) + + for batch_id in range(batch_size): + total = 0 + subword_indices = input_ids[batch_id, : subword_lens[batch_id]] + subwords = subwords_batch[batch_id][: subword_lens[batch_id]] + for subword_idx, subword in zip(subword_indices, subwords): + if subword_idx == unk_token_id: + char_ids = [unk_token_id] + else: + # Get char token indices corresponding to the subwords. + char_ids = [self.generation_config.char_to_id.get(ch, unk_token_id) for ch in list(subword)] + char_seq_len = len(char_ids) + char_seqs[batch_id, total : total + char_seq_len] = torch.tensor(char_ids).to(char_seqs) + total += char_seq_len + return char_seqs + + def _hard_upsample(self, hidden_states, durations): + """ + Repeats the time dimension of each sample in the batch based on the corresponding duration. + + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, *)`, *optional*): + The sequence to repeat, where `*` is any number of sequence-specific dimensions including none. + durations (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indicates how many times to repeat time segments. + """ + if hidden_states.size(0) == 1: + hidden_states = torch.repeat_interleave(hidden_states, durations.view(-1), dim=1) + else: + # if batched sample, need to interleave per sample, and pad -> loss of parallelism + if hidden_states.shape[0] > 1 and self.training: + logger.warning_once( + """`self.training=True` and you use batching. You lose parallelism during the hifigan + forward pass because the samples are interleaved.""" + ) + hidden_states = [ + torch.repeat_interleave(hidden_state, duration, dim=0) + for (hidden_state, duration) in zip(hidden_states, durations) + ] + + hidden_states = nn.utils.rnn.pad_sequence(hidden_states, batch_first=True) + + return hidden_states + + +@auto_docstring( + custom_intro=""" + Transformer speech encoder consisting of *config.speech_encoder_layers* conformer self attention layers. + Each layer is a [`SeamlessM4Tv2ConformerEncoderLayer`]. + """ +) +# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TSpeechEncoder with SeamlessM4T->SeamlessM4Tv2 +class SeamlessM4Tv2SpeechEncoder(SeamlessM4Tv2PreTrainedModel): + main_input_name = "input_features" + + def __init__(self, config: SeamlessM4Tv2Config): + super().__init__(config) + + self.feature_projection = SeamlessM4Tv2ConformerFeatureProjection(config) + self.encoder = SeamlessM4Tv2ConformerEncoder(config) + self.intermediate_ffn = SeamlessM4Tv2ConformerFeedForward(config, act_fn="relu", dropout=0.0) + self.adapter = SeamlessM4Tv2ConformerAdapter(config) if config.add_adapter else None + self.inner_layer_norm = nn.LayerNorm(config.hidden_size) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_features: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple, Wav2Vec2BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_features is None: + raise ValueError( + """Both `input_features` and `inputs_embeds` are `None` in `SeamlessM4Tv2SpeechEncoder.forward`. + Make sure one of them is not `None`.""" + ) + + hidden_states = self.feature_projection(input_features) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + expanded_hidden_states = self.intermediate_ffn(hidden_states) + hidden_states = hidden_states + 0.5 * expanded_hidden_states + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states, attention_mask=attention_mask) + + hidden_states = self.inner_layer_norm(hidden_states) + + if not return_dict: + return (hidden_states,) + encoder_outputs[1:] + + return Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +# inspired from MBart and NllbMoe +@auto_docstring( + custom_intro=""" + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`SeamlessM4Tv2EncoderLayer`]. + """ +) +# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TEncoder with SeamlessM4T->SeamlessM4Tv2 +class SeamlessM4Tv2Encoder(SeamlessM4Tv2PreTrainedModel): + def __init__( + self, + config: SeamlessM4Tv2Config, + embed_tokens: Optional[nn.Embedding] = None, + is_t2u_encoder: bool = False, + ): + r""" + embed_tokens (`nn.Embedding`, *optional*): + Input embedding + is_t2u_encoder (`bool`, *optional*, defaults to `False`): + indicates if it belongs to the text-to-units model, in which case it won't have input embeddings + """ + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + self.padding_idx = config.pad_token_id + embed_dim = config.hidden_size + + self.is_t2u_encoder = is_t2u_encoder + self.max_source_positions = config.max_position_embeddings + + if not self.is_t2u_encoder: + embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + self.embed_tokens = SeamlessM4Tv2ScaledWordEmbedding( + config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding( + self.max_source_positions, + embed_dim, + self.padding_idx, + ) + + layers = [] + for _ in range(config.encoder_layers): + layers.append( + SeamlessM4Tv2EncoderLayer( + config, + encoder_attention_heads=config.encoder_attention_heads, + encoder_ffn_dim=config.encoder_ffn_dim, + ) + ) + + self.layers = nn.ModuleList(layers) + + self.layer_norm = nn.LayerNorm(config.hidden_size) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple, BaseModelOutput]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and self.is_t2u_encoder: + raise ValueError( + "You cannot pass input_ids to the encoder of the text_to_units model. Pass inputs_embeds instead." + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_shape = input.shape + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if not self.is_t2u_encoder: + embed_pos = self.embed_positions(input) + + hidden_states = inputs_embeds + embed_pos.to(inputs_embeds.device) + else: + hidden_states = inputs_embeds + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +@auto_docstring( + custom_intro=""" + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SeamlessM4Tv2DecoderLayer`]. + """ +) +# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TDecoder with SeamlessM4T->SeamlessM4Tv2 +class SeamlessM4Tv2Decoder(SeamlessM4Tv2PreTrainedModel): + def __init__( + self, + config: SeamlessM4Tv2Config, + embed_tokens: Optional[nn.Embedding] = None, + ): + r""" + embed_tokens (`nn.Embedding`, *optional*): + Input embedding + """ + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.max_target_positions = config.max_position_embeddings + embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + # if embed_tokens defined, use its shape instead + self.embed_tokens = SeamlessM4Tv2ScaledWordEmbedding( + embed_tokens.num_embeddings, embed_tokens.embedding_dim, self.padding_idx, embed_scale=embed_scale + ) + self.embed_tokens.weight = embed_tokens.weight + else: + self.embed_tokens = SeamlessM4Tv2ScaledWordEmbedding( + self.vocab_size, config.hidden_size, self.padding_idx, embed_scale=embed_scale + ) + + self.embed_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding( + self.max_target_positions, + config.hidden_size, + padding_idx=self.padding_idx, + ) + + layers = [] + for i in range(config.decoder_layers): + layers.append( + SeamlessM4Tv2DecoderLayer( + config, + decoder_attention_heads=config.decoder_attention_heads, + decoder_ffn_dim=config.decoder_ffn_dim, + layer_idx=i, + ) + ) + self.layers = nn.ModuleList(layers) + self.layer_norm = nn.LayerNorm(config.hidden_size) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_shape = input.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..." + ) + use_cache = False + + # initialize `past_key_values` + if use_cache and past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + if use_cache and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # embed positions + positions = self.embed_positions(input, past_key_values_length=past_key_values_length) + + hidden_states = inputs_embeds + positions.to(inputs_embeds.device) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + layer_outputs = decoder_layer( + hidden_states, + attention_mask, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@auto_docstring( + custom_intro=""" + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SeamlessM4Tv2DecoderLayer`]. + """ +) +class SeamlessM4Tv2TextToUnitDecoder(SeamlessM4Tv2PreTrainedModel): + def __init__( + self, + config: SeamlessM4Tv2Config, + embed_tokens: Optional[nn.Embedding] = None, + ): + r""" + embed_tokens (`nn.Embedding`, *optional*): + Input embedding + """ + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + # if embed_tokens defined, use its shape instead + self.embed_tokens = nn.Embedding(embed_tokens.num_embeddings, embed_tokens.embedding_dim, self.padding_idx) + self.embed_tokens.weight = embed_tokens.weight + else: + self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, self.padding_idx) + + self.embed_char = nn.Embedding(config.char_vocab_size, config.hidden_size) + self.embed_char_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding( + self.max_target_positions, + config.hidden_size, + padding_idx=self.padding_idx, + ) + + self.pos_emb_alpha_char = nn.Parameter(torch.ones(1)) + self.pos_emb_alpha = nn.Parameter(torch.ones(1)) + self.duration_predictor = SeamlessM4Tv2VariancePredictor( + config.variance_predictor_embed_dim, + config.variance_predictor_hidden_dim, + config.variance_predictor_kernel_size, + config.variance_pred_dropout, + ) + + self.embed_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding( + self.max_target_positions, + config.hidden_size, + padding_idx=self.padding_idx, + ) + + layers = [] + for _ in range(config.decoder_layers): + layers.append( + SeamlessM4Tv2TextToUnitDecoderLayer( + config, + decoder_attention_heads=config.decoder_attention_heads, + decoder_ffn_dim=config.decoder_ffn_dim, + ) + ) + self.layers = nn.ModuleList(layers) + self.layer_norm = nn.LayerNorm(config.hidden_size) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + char_input_ids: Optional[torch.LongTensor] = None, + char_count_per_id: Optional[torch.LongTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SeamlessM4Tv2TextToUnitDecoderOutput]: + r""" + Args: + char_input_ids (`torch.LongTensor` of shape `(batch_size, char_sequence_length)`): + Character indices. The correspondence between characters and indices can be found in `char_to_id`, a + dictionary in the generation configuration. + char_count_per_id (`torch.Tensor` of shape `(batch_size, encoder_sequence_length)`): + Number of characters per text input id. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # create padding mask for character lengths + char_padding_mask = _compute_new_attention_mask(char_input_ids, char_count_per_id.sum(1)) + + # upsample hidden states according to characters sequence lengths + char_hidden_states = self._hard_upsample(encoder_hidden_states, char_count_per_id) + # embed char positions + char_positions = self.pos_emb_alpha_char * self.embed_char_positions(inputs_embeds=char_hidden_states) + # update char hidden states with positions and char embeddings + char_hidden_states = self.embed_char(char_input_ids) * self.embed_scale + char_positions + char_hidden_states + + # predict duration + log_dur_pred = self.duration_predictor(char_hidden_states, padding_mask=char_padding_mask) + dur_out = torch.clamp(torch.round(torch.expm1(log_dur_pred)).long(), min=1) + dur_out = dur_out.masked_fill(~char_padding_mask.bool(), 0.0) + + # upsample char hidden states according to predicted duration + char_hidden_states = self._hard_upsample(char_hidden_states, dur_out) + + positions = self.pos_emb_alpha * self.embed_positions(inputs_embeds=char_hidden_states) + hidden_states = char_hidden_states + positions + + padding_mask = _compute_new_attention_mask(hidden_states, dur_out.sum(1)) + attention_mask = _prepare_4d_attention_mask(padding_mask, hidden_states.dtype) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + padding_mask=padding_mask, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[2],) + + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attns, padding_mask] if v is not None) + return SeamlessM4Tv2TextToUnitDecoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + padding_mask=padding_mask, + ) + + +@auto_docstring( + custom_intro=""" + Transformer bare text-to-unit encoder-decoder. The encoder is a [`SeamlessM4Tv2Encoder`] without embeddings and the decoder is a [`SeamlessM4Tv2TextToUnitDecoder`]. + """ +) +class SeamlessM4Tv2TextToUnitModel(SeamlessM4Tv2PreTrainedModel): + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitModel.__init__ with SeamlessM4T->SeamlessM4Tv2, Decoder->TextToUnitDecoder + def __init__( + self, + config: SeamlessM4Tv2Config, + embed_tokens_decoder: Optional[nn.Embedding] = None, + ): + r""" + embed_tokens_decoder (`nn.Embedding`, *optional*): + input embedding of the decoder. + """ + super().__init__(config) + + self.encoder = SeamlessM4Tv2Encoder(config, is_t2u_encoder=True) + self.decoder = SeamlessM4Tv2TextToUnitDecoder(config, embed_tokens_decoder) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + char_input_ids: Optional[torch.LongTensor] = None, + char_count_per_id: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn, padding_mask) + decoder_outputs = self.decoder( + char_input_ids=char_input_ids, + char_count_per_id=char_count_per_id, + encoder_hidden_states=encoder_outputs[0], + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return SeamlessM4Tv2TextToUnitOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + padding_mask=decoder_outputs.padding_mask, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Transformer text-to-unit encoder-decoder with a language model head. The base encoder-decoder model is a [`SeamlessM4Tv2TextToUnitModel`]. + """ +) +class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4Tv2PreTrainedModel, GenerationMixin): + _keys_to_ignore_on_load_missing = [ + "vocoder", + "speech_encoder", + "text_encoder", + "text_decoder", + ] + _tied_weights_keys = ["lm_head.weight"] + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.__init__ with SeamlessM4T->SeamlessM4Tv2 + def __init__( + self, + config: SeamlessM4Tv2Config, + embed_tokens_decoder: Optional[nn.Embedding] = None, + ): + r""" + embed_tokens_decoder (`nn.Embedding`, *optional*): + input embedding of the decoder. + """ + # update config - used principality for bos_token_id etc. + config = copy.deepcopy(config) + for param, val in config.to_dict().items(): + if param.startswith("t2u_"): + config.__setattr__(param[4:], val) + super().__init__(config) + + self.model = SeamlessM4Tv2TextToUnitModel(config, embed_tokens_decoder) + + self.lm_head = nn.Linear(config.hidden_size, config.t2u_vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_encoder + def get_encoder(self): + return self.model.encoder + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_decoder + def get_decoder(self): + return self.model.decoder + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_input_embeddings + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.set_input_embeddings + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + char_input_ids: Optional[torch.LongTensor] = None, + char_count_per_id: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]: + r""" + char_input_ids (`torch.LongTensor` of shape `(batch_size, char_sequence_length)`): + Character indices. The correspondence between characters and indices can be found in `char_to_id`, a + dictionary in the generation configuration. + char_count_per_id (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Number of characters per input id. + inputs_embeds (`torch.FloatTensor` of shape`(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + char_input_ids=char_input_ids, + char_count_per_id=char_count_per_id, + attention_mask=attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + labels = labels.to(lm_logits.device) + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return SeamlessM4Tv2TextToUnitOutput( + last_hidden_state=lm_logits, + padding_mask=outputs.padding_mask, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + loss=masked_lm_loss, + ) + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration._tie_weights + def _tie_weights(self) -> None: + if getattr(self.config, "tie_word_embeddings", True): + output_embeddings = self.get_output_embeddings() + if output_embeddings is not None: + self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) + + +############ VOCODER related code ################ + + +# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock +class HifiGanResidualBlock(nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1): + super().__init__() + self.leaky_relu_slope = leaky_relu_slope + + self.convs1 = nn.ModuleList( + [ + nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + dilation=dilation[i], + padding=self.get_padding(kernel_size, dilation[i]), + ) + for i in range(len(dilation)) + ] + ) + self.convs2 = nn.ModuleList( + [ + nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + dilation=1, + padding=self.get_padding(kernel_size, 1), + ) + for _ in range(len(dilation)) + ] + ) + + def get_padding(self, kernel_size, dilation=1): + return (kernel_size * dilation - dilation) // 2 + + def apply_weight_norm(self): + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + for layer in self.convs1: + weight_norm(layer) + for layer in self.convs2: + weight_norm(layer) + + def remove_weight_norm(self): + for layer in self.convs1: + nn.utils.remove_weight_norm(layer) + for layer in self.convs2: + nn.utils.remove_weight_norm(layer) + + def forward(self, hidden_states): + for conv1, conv2 in zip(self.convs1, self.convs2): + residual = hidden_states + hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) + hidden_states = conv1(hidden_states) + hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) + hidden_states = conv2(hidden_states) + hidden_states = hidden_states + residual + return hidden_states + + +class SeamlessM4Tv2VariancePredictor(nn.Module): + def __init__(self, embed_dim, hidden_dim, kernel_size, var_pred_dropout): + super().__init__() + + self.conv1 = nn.Conv1d( + embed_dim, + hidden_dim, + kernel_size=kernel_size, + padding="same", + ) + self.activation_function = nn.ReLU() + self.ln1 = nn.LayerNorm(hidden_dim) + self.dropout_module = nn.Dropout(p=var_pred_dropout) + self.conv2 = nn.Conv1d( + hidden_dim, + hidden_dim, + kernel_size=kernel_size, + padding="same", + ) + self.ln2 = nn.LayerNorm(hidden_dim) + self.proj = nn.Linear(hidden_dim, 1) + + def forward(self, hidden_states: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor: + # Input: B x T x C; Output: B x T + if padding_mask is not None: + hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0) + hidden_states = self.conv1(hidden_states.transpose(1, 2)) + hidden_states = self.activation_function(hidden_states).transpose(1, 2) + hidden_states = self.dropout_module(self.ln1(hidden_states)) + if padding_mask is not None: + hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0) + hidden_states = self.conv2(hidden_states.transpose(1, 2)) + hidden_states = self.activation_function(hidden_states).transpose(1, 2) + hidden_states = self.dropout_module(self.ln2(hidden_states)) + return self.proj(hidden_states).squeeze(dim=2) + + +# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4THifiGan with SeamlessM4T->SeamlessM4Tv2 +class SeamlessM4Tv2HifiGan(nn.Module): + def __init__(self, config: SeamlessM4Tv2Config): + super().__init__() + model_in_dim = config.unit_embed_dim + config.lang_embed_dim + config.spkr_embed_dim + self.leaky_relu_slope = config.leaky_relu_slope + self.num_kernels = len(config.resblock_kernel_sizes) + self.num_upsamples = len(config.upsample_rates) + self.conv_pre = nn.Conv1d( + model_in_dim, + config.upsample_initial_channel, + kernel_size=7, + stride=1, + padding=3, + ) + + self.upsampler = nn.ModuleList() + for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)): + self.upsampler.append( + nn.ConvTranspose1d( + config.upsample_initial_channel // (2**i), + config.upsample_initial_channel // (2 ** (i + 1)), + kernel_size=kernel_size, + stride=upsample_rate, + padding=(kernel_size - upsample_rate) // 2, + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.upsampler)): + channels = config.upsample_initial_channel // (2 ** (i + 1)) + for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes): + self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope)) + + self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3) + + def forward(self, input_embeds: torch.FloatTensor) -> torch.FloatTensor: + r""" + Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch + of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech + waveform. + + Args: + spectrogram (`torch.FloatTensor`): + Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length, + model_in_dim)`, or un-batched and of shape `(sequence_length, model_in_dim)`. Note that `model_in_dim` + is the sum of `config.unit_embed_dim`, `config.lang_embed_dim` and `config.spkr_embed_dim`. + + Returns: + `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of + shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`. + """ + + hidden_states = self.conv_pre(input_embeds) + for i in range(self.num_upsamples): + hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) + hidden_states = self.upsampler[i](hidden_states) + + res_state = self.resblocks[i * self.num_kernels](hidden_states) + for j in range(1, self.num_kernels): + res_state += self.resblocks[i * self.num_kernels + j](hidden_states) + hidden_states = res_state / self.num_kernels + + hidden_states = nn.functional.leaky_relu(hidden_states) + hidden_states = self.conv_post(hidden_states) + hidden_states = torch.tanh(hidden_states) + + # remove seq-len dim since this collapses to 1 + waveform = hidden_states.squeeze(1) + + return waveform + + +@auto_docstring( + custom_intro=""" + Code HiFi-GAN vocoder as described in this [repository](https://github.com/facebookresearch/speech-resynthesis). + """ +) +class SeamlessM4Tv2CodeHifiGan(PreTrainedModel): + config: SeamlessM4Tv2Config + main_input_name = "input_embeds" + _no_split_modules = [] + + def __init__(self, config): + super().__init__(config) + + self.pad_token_id = config.t2u_pad_token_id + embed_dim = config.unit_embed_dim + kernel_size = config.variance_predictor_kernel_size + var_pred_dropout = config.var_pred_dropout + self.dur_predictor = SeamlessM4Tv2VariancePredictor(embed_dim, embed_dim, kernel_size, var_pred_dropout) + + self.unit_embedding = nn.Embedding(config.unit_hifi_gan_vocab_size, config.unit_embed_dim) + self.speaker_embedding = nn.Embedding(config.vocoder_num_spkrs, config.spkr_embed_dim) + self.language_embedding = nn.Embedding(config.vocoder_num_langs, config.lang_embed_dim) + + self.hifi_gan = SeamlessM4Tv2HifiGan(config) + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._get_dur_output_lengths + def _get_dur_output_lengths(self, input_ids, dur_out): + """ + Computes the output length after the duration layer. + """ + unit_lengths = (input_ids != self.pad_token_id).sum(1) + + # take care of edge cases where no padding or too many padding + unit_lengths = torch.clamp(unit_lengths, 0, dur_out.shape[1] - 1) + + cumulative_dur_out = torch.cumsum(dur_out, dim=1) + unit_lengths = cumulative_dur_out.gather(dim=1, index=unit_lengths.unsqueeze(1)).squeeze() + + return unit_lengths + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._get_output_hifigan_lengths + def _get_output_hifigan_lengths(self, input_lengths: Union[torch.LongTensor, int]): + """ + Computes the output length of the hifigan convolutional layers + """ + + def _conv_out_length(input_length, kernel_size, stride, pad, dilation=1): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return ( + torch.div(input_length + 2 * pad - dilation * (kernel_size - 1) - 1, stride, rounding_mode="floor") + 1 + ) + + def _transpose_conv_out_length(input_length, kernel_size, stride, pad, dilation=1): + return (input_length - 1) * stride - 2 * pad + dilation * (kernel_size - 1) + 1 + + # conv_pre + input_lengths = _conv_out_length(input_lengths, 7, 1, 3) + + # upsampler + for i, (upsample_rate, kernel_size) in enumerate( + zip(self.config.upsample_rates, self.config.upsample_kernel_sizes) + ): + input_lengths = _transpose_conv_out_length( + input_lengths, kernel_size, upsample_rate, (kernel_size - upsample_rate) // 2 + ) + + # resblock + for i in range(len(self.config.upsample_rates)): + for kernel_size, dilation in zip(self.config.resblock_kernel_sizes, self.config.resblock_dilation_sizes): + for dil in dilation: + input_lengths = _conv_out_length( + input_lengths, kernel_size, 1, (kernel_size - 1) * dil // 2, dilation=dil + ) + + for dil in dilation: + input_lengths = _conv_out_length(input_lengths, kernel_size, 1, (kernel_size - 1) // 2, dilation=1) + + # conv_post + input_lengths = _conv_out_length(input_lengths, 7, 1, 3) + + return input_lengths + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.forward with SeamlessM4T->SeamlessM4Tv2, spkr_id->speaker_id + def forward( + self, input_ids: torch.LongTensor, speaker_id: torch.Tensor, lang_id: torch.Tensor + ) -> tuple[torch.Tensor]: + """ + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`SeamlessM4Tv2TextToUnitForConditionalGeneration`]. [What are input + IDs?](../glossary#input-ids) + speaker_id (`int`, *optional*): + The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`. + tgt_lang (`str`, *optional*): + The language id to use as target language for translation. + """ + hidden_states = self.unit_embedding(input_ids).transpose(1, 2) + spkr = self.speaker_embedding(speaker_id).transpose(1, 2) + lang = self.language_embedding(lang_id).transpose(1, 2) + + log_dur_pred = self.dur_predictor(hidden_states.transpose(1, 2)) + dur_out = torch.clamp(torch.round(torch.expm1(log_dur_pred)).long(), min=1) + # B x C x T + if hidden_states.size(0) == 1: + hidden_states = torch.repeat_interleave(hidden_states, dur_out.view(-1), dim=2) + else: + # if batched sample, need to interleave per sample, and pad -> loss of parallelism + if hidden_states.shape[0] > 1 and self.training: + logger.warning( + """`self.training=True` and you use batching. You lose parallelism during the hifigan + forward pass because the samples are interleaved.""" + ) + hidden_states = [ + torch.repeat_interleave(hidden_state, duration, dim=-1).transpose(0, 1) + for (hidden_state, duration) in zip(hidden_states, dur_out) + ] + + hidden_states = nn.utils.rnn.pad_sequence(hidden_states, batch_first=True).transpose(1, 2) + + spkr = spkr.repeat(1, 1, hidden_states.shape[-1]) + lang = lang.repeat(1, 1, hidden_states.shape[-1]) + hidden_states = torch.cat([lang, hidden_states, spkr], dim=1) + + hidden_states = self.hifi_gan(hidden_states) + + unit_lengths = self._get_dur_output_lengths(input_ids, dur_out) + lengths = self._get_output_hifigan_lengths(unit_lengths) + + return hidden_states, lengths + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._init_weights + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.apply_weight_norm + def apply_weight_norm(self): + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + weight_norm(self.hifi_gan.conv_pre) + for layer in self.hifi_gan.upsampler: + weight_norm(layer) + for layer in self.hifi_gan.resblocks: + layer.apply_weight_norm() + weight_norm(self.hifi_gan.conv_post) + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.remove_weight_norm + def remove_weight_norm(self): + nn.utils.remove_weight_norm(self.hifi_gan.conv_pre) + for layer in self.hifi_gan.upsampler: + nn.utils.remove_weight_norm(layer) + for layer in self.hifi_gan.resblocks: + layer.remove_weight_norm() + nn.utils.remove_weight_norm(self.hifi_gan.conv_post) + + +############ WHOLE MODEL related code ################ + + +@auto_docstring( + custom_intro=""" + The text-to-text SeamlessM4Tv2 Model transformer which can be used for T2TT. + """ +) +# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToText with SeamlessM4T->SeamlessM4Tv2,SeamlessM4Tv2Tokenizer->SeamlessM4TTokenizer, SeamlessM4Tv2Processor->SeamlessM4TProcessor, SEAMLESS_M4T->SEAMLESS_M4T_V2 +class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin): + _keys_to_ignore_on_load_missing = ["speech_encoder", "t2u_model", "vocoder"] + main_input_name = "input_ids" + + _tied_weights_keys = [ + "lm_head.weight", + "text_encoder.embed_tokens.weight", + "text_decoder.embed_tokens.weight", + ] + + def __init__(self, config: SeamlessM4Tv2Config): + super().__init__(config) + + self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + + self.text_encoder = SeamlessM4Tv2Encoder(config, self.shared) + self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.text_encoder + + def get_decoder(self): + return self.text_decoder + + def get_input_embeddings(self): + return self.text_decoder.embed_tokens + + def set_input_embeddings(self, value): + self.text_encoder.embed_tokens = value + self.text_decoder.embed_tokens = value + self.shared = value + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.lm_head, self.shared) + + @auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + encoder_attention_mask = attention_mask + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + lm_logits = self.lm_head(decoder_outputs[0]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + labels = labels.to(lm_logits.device) + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + outputs = decoder_outputs + encoder_outputs + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def generate( + self, + input_ids=None, + tgt_lang=None, + generation_config=None, + logits_processor=None, + stopping_criteria=None, + prefix_allowed_tokens_fn=None, + synced_gpus=False, + **kwargs, + ): + """ + Generates sequences of token ids. + + + + Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the + model's default generation configuration. You can override any `generation_config` by passing the corresponding + parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. + + For an overview of generation strategies and code examples, check out the [following + guide](./generation_strategies). + + + + Parameters: + input_ids (`torch.Tensor` of varying shape depending on the modality, *optional*): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + tgt_lang (`str`, *optional*): + The language to use as target language for translation. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. `**kwargs` + passed to generate matching the attributes of `generation_config` will override them. If + `generation_config` is not provided, the default will be used, which had the following loading + priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model + configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s + default values, whose documentation should be checked to parameterize generation. + logits_processor (`LogitsProcessorList`, *optional*): + Custom logits processors that complement the default logits processors built from arguments and + generation config. If a logit processor is passed that is already created with the arguments or a + generation config an error is thrown. This feature is intended for advanced users. + stopping_criteria (`StoppingCriteriaList`, *optional*): + Custom stopping criteria that complement the default stopping criteria built from arguments and a + generation config. If a stopping criteria is passed that is already created with the arguments or a + generation config an error is thrown. This feature is intended for advanced users. + prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*): + If provided, this function constraints the beam search to allowed tokens only at each step. If not + provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and + `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned + on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful + for constrained generation conditioned on the prefix, as described in [Autoregressive Entity + Retrieval](https://huggingface.co/papers/2010.00904). + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). + kwargs (`dict[str, Any]`, *optional*): + Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be + forwarded to the `forward` function of the model. + + Return: + [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` + or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible + [`~utils.ModelOutput`] types are: + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] + """ + # prepare text_decoder_input_ids + text_decoder_input_ids = kwargs.pop("decoder_input_ids", None) + # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. + if tgt_lang is not None: + batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds")) + + if hasattr(self.generation_config, "text_decoder_lang_to_code_id"): + # also accept __xxx__ + tgt_lang = tgt_lang.replace("__", "") + if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id: + raise ValueError( + f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in + {", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}""" + ) + # tgt_lang gets priority over decoder input ids + text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang) + text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device) + else: + raise ValueError( + """This model generation config doesn't have a `text_decoder_lang_to_code_id` key which maps + the target language to the right token id. Make sure to load the right generation config.""" + ) + else: + # only a warning, otherwise errors appear in the tests + logger.warning( + """You must either specify a `tgt_lang` or pass a correct `text_decoder_input_ids` to get + a correct generation, otherwise the generation will probably make no sense.""" + ) + + return super().generate( + input_ids, + generation_config, + logits_processor, + stopping_criteria, + prefix_allowed_tokens_fn, + synced_gpus, + decoder_input_ids=text_decoder_input_ids, + **kwargs, + ) + + +@auto_docstring( + custom_intro=""" + The speech-to-text SeamlessM4Tv2 Model transformer which can be used for S2TT. + """ +) +class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin): + _keys_to_ignore_on_load_missing = ["text_encoder", "t2u_model", "vocoder"] + main_input_name = "input_features" + + _tied_weights_keys = [ + "lm_head.weight", + "text_decoder.embed_tokens.weight", + ] + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.__init__ with SeamlessM4T->SeamlessM4Tv2 + def __init__(self, config: SeamlessM4Tv2Config): + super().__init__(config) + + self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + self.speech_encoder = SeamlessM4Tv2SpeechEncoder(config) + self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_encoder + def get_encoder(self): + return self.speech_encoder + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_decoder + def get_decoder(self): + return self.text_decoder + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_input_embeddings + def get_input_embeddings(self): + return self.text_decoder.embed_tokens + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.set_input_embeddings + def set_input_embeddings(self, value): + self.text_decoder.embed_tokens = value + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText._tie_weights + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.lm_head, self.shared) + + @auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS) + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.forward + def forward( + self, + input_features: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.speech_encoder( + input_features=input_features, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + encoder_attention_mask = attention_mask + if attention_mask is not None: + sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to( + encoder_outputs[0].device + ) + encoder_attention_mask = _compute_new_attention_mask( + hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths + ) + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + lm_logits = self.lm_head(decoder_outputs[0]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + labels = labels.to(lm_logits.device) + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + outputs = decoder_outputs + encoder_outputs + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.generate + def generate( + self, + input_features=None, + tgt_lang=None, + generation_config=None, + logits_processor=None, + stopping_criteria=None, + prefix_allowed_tokens_fn=None, + synced_gpus=False, + **kwargs, + ): + """ + Generates sequences of token ids. + + + + Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the + model's default generation configuration. You can override any `generation_config` by passing the corresponding + parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. + + For an overview of generation strategies and code examples, check out the [following + guide](./generation_strategies). + + + + Parameters: + input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`): + Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the + [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details. + + tgt_lang (`str`, *optional*): + The language to use as target language for translation. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. `**kwargs` + passed to generate matching the attributes of `generation_config` will override them. If + `generation_config` is not provided, the default will be used, which had the following loading + priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model + configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s + default values, whose documentation should be checked to parameterize generation. + logits_processor (`LogitsProcessorList`, *optional*): + Custom logits processors that complement the default logits processors built from arguments and + generation config. If a logit processor is passed that is already created with the arguments or a + generation config an error is thrown. This feature is intended for advanced users. + stopping_criteria (`StoppingCriteriaList`, *optional*): + Custom stopping criteria that complement the default stopping criteria built from arguments and a + generation config. If a stopping criteria is passed that is already created with the arguments or a + generation config an error is thrown. This feature is intended for advanced users. + prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*): + If provided, this function constraints the beam search to allowed tokens only at each step. If not + provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and + `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned + on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful + for constrained generation conditioned on the prefix, as described in [Autoregressive Entity + Retrieval](https://huggingface.co/papers/2010.00904). + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed to avoid deadlocking with + `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). + kwargs (`dict[str, Any]`, *optional*): + Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be + forwarded to the `forward` function of the model. + + Return: + [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` + or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible + [`~utils.ModelOutput`] types are: + - [`~generation.GenerateEncoderDecoderOutput`], + - [`~generation.GenerateBeamEncoderDecoderOutput`] + """ + text_decoder_input_ids = kwargs.pop("decoder_input_ids", None) + # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. + input_features = input_features if input_features is not None else kwargs.pop("inputs") + if tgt_lang is not None: + inputs = kwargs.get("input_embeds") if input_features is None else input_features + inputs = ( + inputs + if inputs is not None + else kwargs.get("encoder_outputs", {"last_hidden_state": None})["last_hidden_state"] + ) + batch_size = len(inputs) + + if hasattr(self.generation_config, "text_decoder_lang_to_code_id"): + # also accept __xxx__ + tgt_lang = tgt_lang.replace("__", "") + if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id: + raise ValueError( + f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in + {", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}""" + ) + # tgt_lang gets priority over decoder input ids + text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang) + text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device) + else: + raise ValueError( + """This model generation config doesn't have a `text_decoder_lang_to_code_id` key which maps + the target language to the right token id. Make sure to load the right generation config.""" + ) + else: + # only a warning, otherwise errors appear in the tests + logger.warning( + """You must either specify a `tgt_lang` or pass a correct `text_decoder_input_ids` to get + a correct generation, otherwise the generation will probably make no sense.""" + ) + return super().generate( + input_features, + generation_config, + logits_processor, + stopping_criteria, + prefix_allowed_tokens_fn, + synced_gpus, + decoder_input_ids=text_decoder_input_ids, + **kwargs, + ) + + +@auto_docstring( + custom_intro=""" + The text-to-speech SeamlessM4Tv2 Model transformer which can be used for T2ST. + """ +) +class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin): + _keys_to_ignore_on_load_missing = ["speech_encoder"] + main_input_name = "input_ids" + + _tied_weights_keys = [ + "lm_head.weight", + "text_encoder.embed_tokens.weight", + "text_decoder.embed_tokens.weight", + ] + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.__init__ with SeamlessM4T->SeamlessM4Tv2 + def __init__(self, config: SeamlessM4Tv2Config): + super().__init__(config) + + self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + + self.text_encoder = SeamlessM4Tv2Encoder(config, self.shared) + self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + self.t2u_model = SeamlessM4Tv2TextToUnitForConditionalGeneration(config) + self.vocoder = SeamlessM4Tv2CodeHifiGan(config) + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_encoder + def get_encoder(self): + return self.text_encoder + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_decoder + def get_decoder(self): + return self.text_decoder + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_input_embeddings + def get_input_embeddings(self): + return self.text_decoder.embed_tokens + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.set_input_embeddings + def set_input_embeddings(self, value): + self.text_encoder.embed_tokens = value + self.text_decoder.embed_tokens = value + self.shared = value + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech._tie_weights + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.lm_head, self.shared) + + @auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS) + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.forward with SeamlessM4T->SeamlessM4Tv2 + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn + logger.warning( + "This is the same forward method as `SeamlessM4Tv2ForTextToText`." + "It doesn't use the text-to-unit model `SeamlessM4Tv2TextToUnitForConditionalGeneration`." + "If you want to generate speech, use the `.generate` method." + ) + encoder_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + encoder_attention_mask = attention_mask + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + lm_logits = self.lm_head(decoder_outputs[0]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + labels = labels.to(lm_logits.device) + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + outputs = decoder_outputs + encoder_outputs + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + return_intermediate_token_ids: Optional[bool] = None, + tgt_lang: Optional[str] = None, + speaker_id: Optional[int] = 0, + **kwargs, + ) -> Union[torch.Tensor, SeamlessM4Tv2GenerationOutput]: + """ + Generates translated audio waveforms. + + + + This method successively calls the `.generate` function of two different sub-models. You can specify keyword + arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments + that will be passed to one of them. + + For example, calling `.generate(input_ids, num_beams=4, speech_do_sample=True)` will successively perform + beam-search decoding on the text model, and multinomial beam-search sampling on the speech model. + + For an overview of generation strategies and code examples, check out the [following + guide](./generation_strategies). + + + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + return_intermediate_token_ids (`bool`, *optional*): + If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want + to get translated text alongside the audio. + tgt_lang (`str`, *optional*): + The language to use as target language for translation. + speaker_id (`int`, *optional*, defaults to 0): + The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`. + kwargs (*optional*): + Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword + arguments are of two types: + + - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model, + except for `decoder_input_ids` which will only be passed through the text components. + - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the + text model and speech model respectively. It has the priority over the keywords without a prefix. + + This means you can, for example, specify a generation strategy for one generation but not for the + other. + + + Returns: + `Union[SeamlessM4Tv2GenerationOutput, tuple[Tensor]]`: + - If `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`]. + - If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size, + sequence_length)` and `waveform_lengths` which gives the length of each sample. + """ + batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds")) + + if tgt_lang is None: + raise ValueError("You must specify a `tgt_lang` to generate translated speech.") + else: + # also accept __xxx__ + tgt_lang = tgt_lang.replace("__", "") + for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]: + lang_code_to_id = getattr(self.generation_config, key, None) + if lang_code_to_id is None: + raise ValueError( + f"""This model generation config doesn't have a `{key}` key which maps the target language + to the right token id. Make sure to load the right generation config.""" + ) + elif tgt_lang not in lang_code_to_id: + raise ValueError( + f"""`tgt_lang={tgt_lang}` is not supported by this model. + Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports + more languages for text translation than for speech synthesis.""" + ) + + kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs) + kwargs_text["output_hidden_states"] = True + kwargs_text["return_dict_in_generate"] = True + kwargs_text["output_scores"] = True + + text_decoder_input_ids = kwargs_text.get("decoder_input_ids") + + # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. + text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang) + text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device) + + kwargs_text["decoder_input_ids"] = text_decoder_input_ids + + # first generation + text_generation_output = super().generate(input_ids, **kwargs_text) + sequences = text_generation_output.sequences + + # prepare second generation + num_return_sequences = len(sequences) // batch_size + attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None)) + + if attention_mask is not None: + # repeat attention mask alongside batch dimension + attention_mask = torch.repeat_interleave(attention_mask, num_return_sequences, dim=0) + encoder_hidden_states = text_generation_output.encoder_hidden_states[-1] + + # repeat attention mask alongside batch dimension + encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, num_return_sequences, dim=0) + + # get decoder last hidden state - must do a pass through the text decoder + t2u_input_embeds = self.text_decoder( + input_ids=sequences[:, :-1], # Manually trim the final EOS token + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=attention_mask, + ).last_hidden_state + + pad_token_id = self.generation_config.pad_token_id + + # Compute new attention mask + seq_lens = (sequences[:, :-1] != pad_token_id).int().sum(1) + t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens) + kwargs_speech["attention_mask"] = t2u_model_attention_mask + + # REMOVE EOS and lang_id + t2u_input_ids = sequences[:, 2:-1] + # replace every other EOS + t2u_input_ids = torch.masked_fill( + t2u_input_ids, t2u_input_ids == self.generation_config.eos_token_id, pad_token_id + ) + + # compute t2u_char_input_ids + t2u_subwords = self._indices_to_subwords(t2u_input_ids) + t2u_char_count_per_id = self._count_character_length_in_subword( + t2u_input_ids, t2u_subwords, pad_token_id=pad_token_id + ) + + # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode. + pad_zero = t2u_char_count_per_id.new_zeros((t2u_char_count_per_id.shape[0], 1)) + t2u_char_count_per_id = torch.cat([pad_zero, t2u_char_count_per_id, pad_zero], dim=1) + t2u_char_input_ids = self._get_char_input_ids( + t2u_input_ids, t2u_subwords, t2u_char_count_per_id, pad_token_id=pad_token_id + ) + + # second pass + t2u_output = self.t2u_model( + inputs_embeds=t2u_input_embeds, + char_input_ids=t2u_char_input_ids, + char_count_per_id=t2u_char_count_per_id, + **kwargs_speech, + ) + + t2u_logits = t2u_output[0] + padding_mask = t2u_output[1].bool() + + # The text-to-unit model is non auto-regressive. We keep the ability to use sampling with temperature + temperature = kwargs_speech.get("temperature", None) + if (temperature is None or temperature == 1.0) or not kwargs_speech.get("do_sample", False): + unit_ids = t2u_logits.argmax(dim=-1) + else: + t2u_logits = t2u_logits / temperature + # apply softmax + probs = nn.functional.softmax(t2u_logits, dim=-1) + # reshape to 2D: (batch_size, seq_len, t2u_vocab_size) -> (batch_size*seq_len, t2u_vocab_size) + probs = probs.reshape((-1, probs.shape[2])) + # multinomial then reshape : (batch_size*seq_len)-> (batch_size,seq_len) + unit_ids = torch.multinomial(probs, num_samples=1).view(t2u_logits.shape[0], -1) + + output_unit_ids = unit_ids.detach().clone() + + replace_mask = (unit_ids == self.config.t2u_eos_token_id) | (~padding_mask) + # replace eos per pad + unit_ids = unit_ids.masked_fill(replace_mask, self.config.t2u_pad_token_id) + + # offset of control symbols + unit_ids = torch.where( + unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset + ) + + vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang) + vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device) + + speaker_id = torch.tensor([[speaker_id]] * len(unit_ids), device=self.device) + + waveform, waveform_lengths = self.vocoder( + input_ids=unit_ids, speaker_id=speaker_id, lang_id=vocoder_tgt_lang_id + ) + + if return_intermediate_token_ids: + return SeamlessM4Tv2GenerationOutput( + waveform=waveform, + waveform_lengths=waveform_lengths, + sequences=sequences, + unit_sequences=output_unit_ids, + ) + + return waveform, waveform_lengths + + +@auto_docstring( + custom_intro=""" + The speech-to-speech SeamlessM4Tv2 Model transformer which can be used for S2ST. + """ +) +class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin): + _keys_to_ignore_on_load_missing = ["text_encoder"] + main_input_name = "input_features" + + _tied_weights_keys = [ + "lm_head.weight", + "text_decoder.embed_tokens.weight", + ] + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.__init__ with SeamlessM4T->SeamlessM4Tv2 + def __init__(self, config): + super().__init__(config) + + self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + self.speech_encoder = SeamlessM4Tv2SpeechEncoder(config) + self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + self.t2u_model = SeamlessM4Tv2TextToUnitForConditionalGeneration(config) + self.vocoder = SeamlessM4Tv2CodeHifiGan(config) + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_encoder + def get_encoder(self): + return self.speech_encoder + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_decoder + def get_decoder(self): + return self.text_decoder + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_input_embeddings + def get_input_embeddings(self): + return self.text_decoder.embed_tokens + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.set_input_embeddings + def set_input_embeddings(self, value): + self.text_decoder.embed_tokens = value + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech._tie_weights + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.lm_head, self.shared) + + @auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS) + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.forward with SeamlessM4T->SeamlessM4Tv2 + def forward( + self, + input_features: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn + logger.warning( + "This is the same forward method as `SeamlessM4Tv2ForSpeechToText`. It doesn't use `self.t2u_model`." + "If you want to generate speech, use the `generate` method." + ) + + encoder_outputs = self.speech_encoder( + input_features=input_features, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + encoder_attention_mask = attention_mask + if attention_mask is not None: + sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to( + encoder_outputs[0].device + ) + encoder_attention_mask = _compute_new_attention_mask( + hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths + ) + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + lm_logits = self.lm_head(decoder_outputs[0]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + labels = labels.to(lm_logits.device) + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + outputs = decoder_outputs + encoder_outputs + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + input_features: Optional[torch.Tensor] = None, + return_intermediate_token_ids: Optional[bool] = None, + tgt_lang: Optional[str] = None, + speaker_id: Optional[int] = 0, + **kwargs, + ) -> Union[torch.Tensor, SeamlessM4Tv2GenerationOutput]: + """ + Generates translated audio waveforms. + + + + This method successively calls the `.generate` function of two different sub-models. You can specify keyword + arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments + that will be passed to one of them. + + For example, calling `.generate(input_features, num_beams=4, speech_do_sample=True)` will successively perform + beam-search decoding on the text model, and multinomial beam-search sampling on the speech model. + + For an overview of generation strategies and code examples, check out the [following + guide](./generation_strategies). + + + + Args: + input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`): + Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the + [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details. + return_intermediate_token_ids (`bool`, *optional*): + If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want + to get translated text alongside the audio. + tgt_lang (`str`, *optional*): + The language to use as target language for translation. + speaker_id (`int`, *optional*, defaults to 0): + The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`. + + kwargs (*optional*): + Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword + arguments are of two types: + + - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model, + except for `decoder_input_ids` which will only be passed through the text components. + - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the + text model and speech model respectively. It has the priority over the keywords without a prefix. + + This means you can, for example, specify a generation strategy for one generation but not for the + other. + + + Returns: + `Union[SeamlessM4Tv2GenerationOutput, tuple[Tensor]]`: + - If `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`]. + - If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size, + sequence_length)` and `waveform_lengths` which gives the length of each sample. + """ + batch_size = len(input_features) if input_features is not None else len(kwargs.get("inputs_embeds")) + + if tgt_lang is None: + raise ValueError("You must specify a `tgt_lang` to generate translated speech.") + else: + # also accept __xxx__ + tgt_lang = tgt_lang.replace("__", "") + for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]: + lang_code_to_id = getattr(self.generation_config, key, None) + if lang_code_to_id is None: + raise ValueError( + f"""This model generation config doesn't have a `{key}` key which maps the target language + to the right token id. Make sure to load the right generation config.""" + ) + elif tgt_lang not in lang_code_to_id: + raise ValueError( + f"""`tgt_lang={tgt_lang}` is not supported by this model. + Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports + more languages for text translation than for speech synthesis.""" + ) + + kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs) + kwargs_text["output_hidden_states"] = True + kwargs_text["return_dict_in_generate"] = True + kwargs_text["output_scores"] = True + + text_decoder_input_ids = kwargs_text.get("decoder_input_ids") + # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. + text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang) + text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device) + + kwargs_text["decoder_input_ids"] = text_decoder_input_ids + + # first generation + text_generation_output = super().generate(input_features, **kwargs_text) + sequences = text_generation_output.sequences + + # prepare second generation + num_return_sequences = len(sequences) // batch_size + attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None)) + + # get last_hidden_state from encoder + encoder_hidden_states = self.speech_encoder(input_features=input_features, attention_mask=attention_mask)[0] + + # input modality = speech so new attention mask for the decoder + if attention_mask is not None: + sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to( + encoder_hidden_states.device + ) + attention_mask = _compute_new_attention_mask( + hidden_states=encoder_hidden_states, seq_lens=sub_sampled_lengths + ) + + # repeat attention mask alongside batch dimension + attention_mask = torch.repeat_interleave(attention_mask, num_return_sequences, dim=0) + + # repeat attention mask alongside batch dimension + encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, num_return_sequences, dim=0) + + # get decoder last hidden state - must do a pass through the text decoder + t2u_input_embeds = self.text_decoder( + input_ids=sequences[:, :-1], # Manually trim the final EOS token + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=attention_mask, + ).last_hidden_state + + pad_token_id = self.generation_config.pad_token_id + + # Compute new attention mask + seq_lens = (sequences[:, :-1] != pad_token_id).int().sum(1) + t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens) + kwargs_speech["attention_mask"] = t2u_model_attention_mask + + # REMOVE EOS and lang_id + t2u_input_ids = sequences[:, 2:-1] + # replace every other EOS + t2u_input_ids = torch.masked_fill( + t2u_input_ids, t2u_input_ids == self.generation_config.eos_token_id, pad_token_id + ) + + # compute t2u_char_input_ids + t2u_subwords = self._indices_to_subwords(t2u_input_ids) + t2u_char_count_per_id = self._count_character_length_in_subword( + t2u_input_ids, t2u_subwords, pad_token_id=pad_token_id + ) + + # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode. + pad_zero = t2u_char_count_per_id.new_zeros((t2u_char_count_per_id.shape[0], 1)) + t2u_char_count_per_id = torch.cat([pad_zero, t2u_char_count_per_id, pad_zero], dim=1) + t2u_char_input_ids = self._get_char_input_ids( + t2u_input_ids, t2u_subwords, t2u_char_count_per_id, pad_token_id=pad_token_id + ) + + # second pass + t2u_output = self.t2u_model( + inputs_embeds=t2u_input_embeds, + char_input_ids=t2u_char_input_ids, + char_count_per_id=t2u_char_count_per_id, + **kwargs_speech, + ) + + t2u_logits = t2u_output[0] + padding_mask = t2u_output[1].bool() + + # The text-to-unit model is non auto-regressive. We keep the ability to use sampling with temperature + temperature = kwargs_speech.get("temperature", None) + if (temperature is None or temperature == 1.0) or not kwargs_speech.get("do_sample", False): + unit_ids = t2u_logits.argmax(dim=-1) + else: + t2u_logits = t2u_logits / temperature + # apply softmax + probs = nn.functional.softmax(t2u_logits, dim=-1) + # reshape to 2D: (batch_size, seq_len, t2u_vocab_size) -> (batch_size*seq_len, t2u_vocab_size) + probs = probs.reshape((-1, probs.shape[2])) + # multinomial then reshape : (batch_size*seq_len)-> (batch_size,seq_len) + unit_ids = torch.multinomial(probs, num_samples=1).view(t2u_logits.shape[0], -1) + + output_unit_ids = unit_ids.detach().clone() + + replace_mask = (unit_ids == self.config.t2u_eos_token_id) | (~padding_mask) + # replace eos per pad + unit_ids = unit_ids.masked_fill(replace_mask, self.config.t2u_pad_token_id) + + # offset of control symbols + unit_ids = torch.where( + unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset + ) + + vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang) + vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device) + + speaker_id = torch.tensor([[speaker_id]] * len(unit_ids), device=self.device) + + waveform, waveform_lengths = self.vocoder( + input_ids=unit_ids, speaker_id=speaker_id, lang_id=vocoder_tgt_lang_id + ) + + if return_intermediate_token_ids: + return SeamlessM4Tv2GenerationOutput( + waveform=waveform, + waveform_lengths=waveform_lengths, + sequences=sequences, + unit_sequences=output_unit_ids, + ) + + return waveform, waveform_lengths + + +@auto_docstring( + custom_intro=""" + The original SeamlessM4Tv2 Model transformer which can be used for every tasks available (S2ST, S2TT, T2TT, T2ST). + """ +) +class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin): + _tied_weights_keys = [ + "lm_head.weight", + "text_encoder.embed_tokens.weight", + "text_decoder.embed_tokens.weight", + ] + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.__init__ with SeamlessM4T->SeamlessM4Tv2 + def __init__(self, config, current_modality="text"): + r""" + current_modality (`str`, *optional*, defaults to `"text"`): + Default modality. Used to initialize the model. + """ + super().__init__(config) + + self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + + self.text_encoder = SeamlessM4Tv2Encoder(config, self.shared) + self.speech_encoder = SeamlessM4Tv2SpeechEncoder(config) + self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + self.current_modality = current_modality + if current_modality == "speech": + self.main_input_name = "input_features" + + # these models already call post_init in their initialization + self.t2u_model = SeamlessM4Tv2TextToUnitForConditionalGeneration(config) + self.vocoder = SeamlessM4Tv2CodeHifiGan(config) + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.set_modality + def set_modality(self, modality="text"): + if modality == "text": + self.main_input_name = "input_ids" + self.current_modality = "text" + elif modality == "speech": + self.main_input_name = "input_features" + self.current_modality = "speech" + else: + raise ValueError(f"`modality={modality}` is not a valid modality. It must be `text` or `speech`.") + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.get_encoder + def get_encoder(self): + if self.current_modality == "text": + return self.text_encoder + else: + return self.speech_encoder + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.get_input_embeddings + def get_input_embeddings(self): + return self.text_decoder.embed_tokens + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.set_input_embeddings + def set_input_embeddings(self, value): + self.text_encoder.embed_tokens = value + self.text_decoder.embed_tokens = value + self.shared = value + + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel._tie_weights + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.lm_head, self.shared) + + @auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS) + # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.forward with SeamlessM4T->SeamlessM4Tv2 + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + if input_ids is None and input_features is None and inputs_embeds is None and encoder_outputs is None: + raise ValueError( + "`input_ids`,`input_features`, `inputs_embeds` and `encoder_outputs` are all empty. Make sure at least one of them is not." + ) + elif input_features is not None: + if input_ids is not None: + logger.warning( + "`input_ids` is not `None` but `input_features` has been given." + "`input_features` will be used in priority through the `speech_encoder`. " + "Make sure that `input_features` and `input_ids` are mutually exclusive." + ) + + if inputs_embeds is not None: + logger.warning( + "`inputs_embeds` is not `None` but `input_features` has been given." + "`input_features` will be used in priority through `speech_encoder`. " + "`inputs_embeds` will be ignored." + ) + + # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn + logger.warning( + "This calls the same method `forward` as `SeamlessM4Tv2ForTextToText` and `SeamlessM4Tv2ForSpeechToText`" + "depending on the input modality. If you want to generate speech, use the `generate` method." + ) + + self.set_modality("speech") + + encoder_outputs = self.speech_encoder( + input_features=input_features, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + elif input_ids is not None or inputs_embeds is not None: + # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn + logger.warning( + "This calls the same method `forward` as `SeamlessM4Tv2ForTextToText` and `SeamlessM4Tv2ForSpeechToText`" + "depending on the input modality. If you want to generate speech, use the `generate` method." + ) + self.set_modality("text") + encoder_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + encoder_attention_mask = attention_mask + # input modality = speech so new attention mask + if self.current_modality == "speech" and attention_mask is not None: + sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to( + encoder_outputs[0].device + ) + encoder_attention_mask = _compute_new_attention_mask( + hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths + ) + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + lm_logits = self.lm_head(decoder_outputs[0]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + labels = labels.to(lm_logits.device) + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + outputs = decoder_outputs + encoder_outputs + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + input_features: Optional[torch.Tensor] = None, + return_intermediate_token_ids: Optional[bool] = None, + tgt_lang: Optional[str] = None, + speaker_id: Optional[int] = 0, + generate_speech: Optional[bool] = True, + **kwargs, + ) -> Union[torch.Tensor, SeamlessM4Tv2GenerationOutput]: + """ + Generates translated token ids and/or translated audio waveforms. + + + + This method successively calls the `.generate` function of two different sub-models. You can specify keyword + arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments + that will be passed to one of them. + + For example, calling `.generate(input_ids=input_ids, num_beams=4, speech_do_sample=True)` will successively + perform beam-search decoding on the text model, and multinomial beam-search sampling on the speech model. + + For an overview of generation strategies and code examples, check out the [following + guide](./generation_strategies). + + + + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*): + Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the + [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details. + return_intermediate_token_ids (`bool`, *optional*): + If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want + to get translated text alongside the audio. + Note that if `generate_speech=False`, this parameter will be ignored and + the text tokens are returned. + tgt_lang (`str`, *optional*): + The language to use as target language for translation. + speaker_id (`int`, *optional*, defaults to 0): + The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`. + generate_speech (`bool`, *optional*, defaults to `True`): + If `False`, will only returns the text tokens and won't generate speech. + + kwargs (*optional*): + Remaining dictioy of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword + arguments are of two types: + + - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model, + except for `decoder_input_ids` which will only be passed through the text components. + - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the + text model and speech model respectively. It has the priority over the keywords without a prefix. + + This means you can, for example, specify a generation strategy for one generation but not for the + other. + + Returns: + `Union[SeamlessM4Tv2GenerationOutput, tuple[Tensor], ModelOutput]`: + - If `generate_speech` and `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`]. + - If `generate_speech` and not `return_intermediate_token_ids`, returns a tuple composed of waveforms of + shape `(batch_size, sequence_length)` and `waveform_lengths` which gives the length of each sample. + - If `generate_speech=False`, it will returns `ModelOutput`. + """ + if input_ids is None and input_features is None and kwargs.get("inputs_embeds") is None: + raise ValueError( + "`input_ids`,`input_features` and `inputs_embeds` are all empty. Make sure at least one of them is not." + ) + + if generate_speech and tgt_lang is None: + raise ValueError("You must specify a `tgt_lang` to generate translated speech.") + + if tgt_lang is not None: + # also accept __xxx__ + tgt_lang = tgt_lang.replace("__", "") + if generate_speech: + keys_to_check = ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"] + else: + keys_to_check = ["text_decoder_lang_to_code_id"] + for key in keys_to_check: + lang_code_to_id = getattr(self.generation_config, key, None) + if lang_code_to_id is None: + raise ValueError( + f"""This model generation config doesn't have a `{key}` key which maps the target language + to the right token id. Make sure to load the right generation config.""" + ) + elif tgt_lang not in lang_code_to_id: + raise ValueError( + f"""`tgt_lang={tgt_lang}` is not supported by this model. + Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports + more languages for text translation than for speech synthesis.""" + ) + + batch_size = ( + len(input_features) + if input_features is not None + else (len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds"))) + ) + + kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs) + kwargs_text["output_hidden_states"] = True + kwargs_text["return_dict_in_generate"] = True + kwargs_text["output_scores"] = True + + text_decoder_input_ids = kwargs_text.get("decoder_input_ids") + # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids. + if tgt_lang is not None: + # tgt_lang gets priority over decoder input ids + text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang) + text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device) + + kwargs_text["decoder_input_ids"] = text_decoder_input_ids + + # first generation + if input_features is not None: + self.set_modality("speech") + if input_ids is not None: + logger.warning( + "`input_features` and `input_ids` are both non empty. `input_features` will be used in priority " + "through the speech encoder. Make sure `input_features=None` if you want to use the text encoder." + ) + text_generation_output = super().generate(input_features=input_features, **kwargs_text) + else: + self.set_modality("text") + text_generation_output = super().generate(input_ids=input_ids, input_features=None, **kwargs_text) + sequences = text_generation_output.sequences + + if not generate_speech: + return text_generation_output + + # prepare second generation + num_return_sequences = len(sequences) // batch_size + attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None)) + + # get encoder last hidden states + if self.current_modality == "speech": + # get last_hidden_state from encoder - must do a pass through the speech encoder + encoder_hidden_states = self.speech_encoder( + input_features=input_features, attention_mask=attention_mask + ).last_hidden_state + + # input modality = speech so new attention mask for the decoder + if attention_mask is not None: + sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to( + encoder_hidden_states.device + ) + attention_mask = _compute_new_attention_mask( + hidden_states=encoder_hidden_states, seq_lens=sub_sampled_lengths + ) + else: + encoder_hidden_states = text_generation_output.encoder_hidden_states[-1] + + if attention_mask is not None: + # repeat attention mask alongside batch dimension + attention_mask = torch.repeat_interleave(attention_mask, num_return_sequences, dim=0) + + # repeat attention mask alongside batch dimension + encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, num_return_sequences, dim=0) + + # get decoder last hidden state - must do a pass through the text decoder + t2u_input_embeds = self.text_decoder( + input_ids=sequences[:, :-1], # Manually trim the final EOS token + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=attention_mask, + ).last_hidden_state + + pad_token_id = self.generation_config.pad_token_id + + # Compute new attention mask + seq_lens = (sequences[:, :-1] != pad_token_id).int().sum(1) + t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens) + kwargs_speech["attention_mask"] = t2u_model_attention_mask + + # REMOVE EOS and lang_id + t2u_input_ids = sequences[:, 2:-1] + # replace every other EOS + t2u_input_ids = torch.masked_fill( + t2u_input_ids, t2u_input_ids == self.generation_config.eos_token_id, pad_token_id + ) + + # compute t2u_char_input_ids + t2u_subwords = self._indices_to_subwords(t2u_input_ids) + t2u_char_count_per_id = self._count_character_length_in_subword( + t2u_input_ids, t2u_subwords, pad_token_id=pad_token_id + ) + + # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode. + pad_zero = t2u_char_count_per_id.new_zeros((t2u_char_count_per_id.shape[0], 1)) + t2u_char_count_per_id = torch.cat([pad_zero, t2u_char_count_per_id, pad_zero], dim=1) + t2u_char_input_ids = self._get_char_input_ids( + t2u_input_ids, t2u_subwords, t2u_char_count_per_id, pad_token_id=pad_token_id + ) + + # second pass + t2u_output = self.t2u_model( + inputs_embeds=t2u_input_embeds, + char_input_ids=t2u_char_input_ids, + char_count_per_id=t2u_char_count_per_id, + **kwargs_speech, + ) + + t2u_logits = t2u_output[0] + padding_mask = t2u_output[1].bool() + + # The text-to-unit model is non auto-regressive. We keep the ability to use sampling with temperature + temperature = kwargs_speech.get("temperature", None) + if (temperature is None or temperature == 1.0) or not kwargs_speech.get("do_sample", False): + unit_ids = t2u_logits.argmax(dim=-1) + else: + t2u_logits = t2u_logits / temperature + # apply softmax + probs = nn.functional.softmax(t2u_logits, dim=-1) + # reshape to 2D: (batch_size, seq_len, t2u_vocab_size) -> (batch_size*seq_len, t2u_vocab_size) + probs = probs.reshape((-1, probs.shape[2])) + # multinomial then reshape : (batch_size*seq_len)-> (batch_size,seq_len) + unit_ids = torch.multinomial(probs, num_samples=1).view(t2u_logits.shape[0], -1) + + output_unit_ids = unit_ids.detach().clone() + + replace_mask = (unit_ids == self.config.t2u_eos_token_id) | (~padding_mask) + # replace eos per pad + unit_ids = unit_ids.masked_fill(replace_mask, self.config.t2u_pad_token_id) + + # offset of control symbols + unit_ids = torch.where( + unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset + ) + + vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang) + vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device) + + speaker_id = torch.tensor([[speaker_id]] * len(unit_ids), device=self.device) + + waveform, waveform_lengths = self.vocoder( + input_ids=unit_ids, speaker_id=speaker_id, lang_id=vocoder_tgt_lang_id + ) + + if return_intermediate_token_ids: + return SeamlessM4Tv2GenerationOutput( + waveform=waveform, + waveform_lengths=waveform_lengths, + sequences=sequences, + unit_sequences=output_unit_ids, + ) + + return waveform, waveform_lengths + + +__all__ = [ + "SeamlessM4Tv2ForTextToSpeech", + "SeamlessM4Tv2ForSpeechToSpeech", + "SeamlessM4Tv2ForTextToText", + "SeamlessM4Tv2ForSpeechToText", + "SeamlessM4Tv2Model", + "SeamlessM4Tv2PreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2968e9ec3e72ce365688284ea83c7b8743203ca5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/configuration_seed_oss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/configuration_seed_oss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53e1fb2247ea7de52f7f235261be7c601da30f78 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/configuration_seed_oss.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/modeling_seed_oss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/modeling_seed_oss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc3bf785bcff47269b00a5f5bf50c351763d1d39 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/modeling_seed_oss.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/modular_seed_oss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/modular_seed_oss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09b22fc2b38ad84c6d70e03dab46f0d6b5a89b0f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/modular_seed_oss.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6e7fcb016c02c39d92c6bfa3e16f3375ff7d2b1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/configuration_segformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/configuration_segformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6defbc4eb8ed4b0e9d2d777517076cf514e52828 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/configuration_segformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/feature_extraction_segformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/feature_extraction_segformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f457088952f34497bc70313a351f12d1319edc7d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/feature_extraction_segformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/image_processing_segformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/image_processing_segformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9bd902594f20f523909b4d03bfc03a3a8afe861 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/image_processing_segformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/image_processing_segformer_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/image_processing_segformer_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d41073e3aa839a57e1ba05378cb6c5ee442cc6ce Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/image_processing_segformer_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modeling_segformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modeling_segformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e27e305356e9af070c0e779788def9720c967a1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modeling_segformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modeling_tf_segformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modeling_tf_segformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63a45366939b884fa97c08311f43d29c39d324c2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modeling_tf_segformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modular_segformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modular_segformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..396606b9c2b10d4bfe4643eb2d755f847cd485d8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modular_segformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/image_processing_segformer_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/image_processing_segformer_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..36befda0eaa3f80e8f21db9bd9167f98ceb19948 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/image_processing_segformer_fast.py @@ -0,0 +1,234 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/segformer/modular_segformer.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_segformer.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import BatchFeature +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + SizeDict, + is_torch_tensor, +) +from ...processing_utils import Unpack +from ...utils import TensorType, auto_docstring + + +class SegformerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + r""" + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. + """ + + do_reduce_labels: Optional[bool] + + +@auto_docstring +class SegformerImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BILINEAR + image_mean = IMAGENET_DEFAULT_MEAN + image_std = IMAGENET_DEFAULT_STD + size = {"height": 512, "width": 512} + default_to_square = True + crop_size = None + do_resize = True + do_center_crop = None + do_rescale = True + do_normalize = True + do_reduce_labels = False + valid_kwargs = SegformerFastImageProcessorKwargs + rescale_factor = 1 / 255 + + def __init__(self, **kwargs: Unpack[SegformerFastImageProcessorKwargs]): + super().__init__(**kwargs) + + def reduce_label(self, labels: list["torch.Tensor"]): + for idx in range(len(labels)): + label = labels[idx] + label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype), label) + label = label - 1 + label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype), label) + labels[idx] = label + + return label + + @auto_docstring + def preprocess( + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + **kwargs: Unpack[SegformerFastImageProcessorKwargs], + ) -> BatchFeature: + r""" + segmentation_maps (`ImageInput`, *optional*): + The segmentation maps to preprocess. + """ + return super().preprocess(images, segmentation_maps, **kwargs) + + def _preprocess_image_like_inputs( + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput], + do_convert_rgb: bool, + input_data_format: ChannelDimension, + device: Optional[Union[str, "torch.device"]] = None, + **kwargs: Unpack[SegformerFastImageProcessorKwargs], + ) -> BatchFeature: + """ + Preprocess image-like inputs. + """ + images = self._prepare_image_like_inputs( + images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device + ) + images_kwargs = kwargs.copy() + images_kwargs["do_reduce_labels"] = False + batch_feature = self._preprocess(images, **images_kwargs) + + if segmentation_maps is not None: + processed_segmentation_maps = self._prepare_image_like_inputs( + images=segmentation_maps, + expected_ndims=2, + do_convert_rgb=False, + input_data_format=ChannelDimension.FIRST, + ) + + segmentation_maps_kwargs = kwargs.copy() + segmentation_maps_kwargs.update( + { + "do_normalize": False, + "do_rescale": False, + # Nearest interpolation is used for segmentation maps instead of BILINEAR. + "interpolation": F.InterpolationMode.NEAREST_EXACT, + } + ) + processed_segmentation_maps = self._preprocess( + images=processed_segmentation_maps, **segmentation_maps_kwargs + ).pixel_values + batch_feature["labels"] = processed_segmentation_maps.squeeze(1).to(torch.int64) + + return batch_feature + + def _preprocess( + self, + images: list["torch.Tensor"], + do_reduce_labels: bool, + interpolation: Optional["F.InterpolationMode"], + do_resize: bool, + do_rescale: bool, + do_normalize: bool, + size: SizeDict, + rescale_factor: float, + image_mean: Union[float, list[float]], + image_std: Union[float, list[float]], + disable_grouping: bool, + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: # Return type can be list if return_tensors=None + if do_reduce_labels: + images = self.reduce_label(images) # Apply reduction if needed + + # Group images by size for batched resizing + resized_images = images + if do_resize: + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + resized_stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = resized_stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing (rescale/normalize) + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + + # Stack images into a single tensor if return_tensors is set + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + + def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple]] = None): + """ + Converts the output of [`SegformerForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch. + + Args: + outputs ([`SegformerForSemanticSegmentation`]): + Raw outputs of the model. + target_sizes (`list[Tuple]` of length `batch_size`, *optional*): + List of tuples corresponding to the requested final size (height, width) of each prediction. If unset, + predictions will not be resized. + + Returns: + semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic + segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is + specified). Each entry of each `torch.Tensor` correspond to a semantic class id. + """ + # TODO: add support for other frameworks + logits = outputs.logits + + # Resize logits and compute semantic segmentation maps + if target_sizes is not None: + if len(logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + if is_torch_tensor(target_sizes): + target_sizes = target_sizes.numpy() + + semantic_segmentation = [] + + for idx in range(len(logits)): + resized_logits = torch.nn.functional.interpolate( + logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False + ) + semantic_map = resized_logits[0].argmax(dim=0) + semantic_segmentation.append(semantic_map) + else: + semantic_segmentation = logits.argmax(dim=1) + semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] + + return semantic_segmentation + + +__all__ = ["SegformerImageProcessorFast"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/modeling_tf_segformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/modeling_tf_segformer.py new file mode 100644 index 0000000000000000000000000000000000000000..2f8e68b957487a172101d8b32241a572c71ddf02 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/modeling_tf_segformer.py @@ -0,0 +1,1044 @@ +# coding=utf-8 +# Copyright 2022 NVIDIA The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TensorFlow SegFormer model.""" + +from __future__ import annotations + +import math + +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_tf_outputs import TFBaseModelOutput, TFSemanticSegmenterOutput, TFSequenceClassifierOutput +from ...modeling_tf_utils import ( + TFPreTrainedModel, + TFSequenceClassificationLoss, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import shape_list, stable_softmax +from ...utils import logging +from .configuration_segformer import SegformerConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "SegformerConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "nvidia/mit-b0" +_EXPECTED_OUTPUT_SHAPE = [1, 256, 16, 16] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "nvidia/mit-b0" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + + +# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->Segformer +class TFSegformerDropPath(keras.layers.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + References: + (1) github.com:rwightman/pytorch-image-models + """ + + def __init__(self, drop_path: float, **kwargs): + super().__init__(**kwargs) + self.drop_path = drop_path + + def call(self, x: tf.Tensor, training=None): + if training: + keep_prob = 1 - self.drop_path + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +class TFSegformerOverlapPatchEmbeddings(keras.layers.Layer): + """Construct the overlapping patch embeddings.""" + + def __init__(self, patch_size, stride, num_channels, hidden_size, **kwargs): + super().__init__(**kwargs) + self.padding = keras.layers.ZeroPadding2D(padding=patch_size // 2) + self.proj = keras.layers.Conv2D( + filters=hidden_size, kernel_size=patch_size, strides=stride, padding="VALID", name="proj" + ) + + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") + self.num_channels = num_channels + self.hidden_size = hidden_size + + def call(self, pixel_values: tf.Tensor) -> tuple[tf.Tensor, int, int]: + embeddings = self.proj(self.padding(pixel_values)) + height = shape_list(embeddings)[1] + width = shape_list(embeddings)[2] + hidden_dim = shape_list(embeddings)[3] + # (batch_size, height, width, num_channels) -> (batch_size, height*width, num_channels) + # this can be fed to a Transformer layer + embeddings = tf.reshape(embeddings, (-1, height * width, hidden_dim)) + embeddings = self.layer_norm(embeddings) + return embeddings, height, width + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "proj", None) is not None: + with tf.name_scope(self.proj.name): + self.proj.build([None, None, None, self.num_channels]) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.hidden_size]) + + +class TFSegformerEfficientSelfAttention(keras.layers.Layer): + """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT + paper](https://huggingface.co/papers/2102.12122).""" + + def __init__( + self, + config: SegformerConfig, + hidden_size: int, + num_attention_heads: int, + sequence_reduction_ratio: int, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})" + ) + + self.attention_head_size = self.hidden_size // self.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = keras.layers.Dense(self.all_head_size, name="query") + self.key = keras.layers.Dense(self.all_head_size, name="key") + self.value = keras.layers.Dense(self.all_head_size, name="value") + + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) + + self.sr_ratio = sequence_reduction_ratio + if sequence_reduction_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=hidden_size, kernel_size=sequence_reduction_ratio, strides=sequence_reduction_ratio, name="sr" + ) + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") + + def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] + # to [batch_size, seq_length, num_attention_heads, attention_head_size] + batch_size = shape_list(tensor)[0] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] + # to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + height: int, + width: int, + output_attentions: bool = False, + training: bool = False, + ) -> tf.Tensor | tuple[tf.Tensor, tf.Tensor]: + batch_size = shape_list(hidden_states)[0] + num_channels = shape_list(hidden_states)[2] + + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + if self.sr_ratio > 1: + # Reshape to (batch_size, height, width, num_channels) + hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels)) + # Apply sequence reduction + hidden_states = self.sr(hidden_states) + # Reshape back to (batch_size, seq_len, num_channels) + hidden_states = tf.reshape(hidden_states, (batch_size, -1, num_channels)) + hidden_states = self.layer_norm(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + + scale = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, scale) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs, training=training) + + context_layer = tf.matmul(attention_probs, value_layer) + + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + # (batch_size, seq_len_q, all_head_size) + context_layer = tf.reshape(context_layer, (batch_size, -1, self.all_head_size)) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.hidden_size]) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.hidden_size]) + if getattr(self, "sr", None) is not None: + with tf.name_scope(self.sr.name): + self.sr.build([None, None, None, self.hidden_size]) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.hidden_size]) + + +class TFSegformerSelfOutput(keras.layers.Layer): + def __init__(self, config: SegformerConfig, hidden_size: int, **kwargs): + super().__init__(**kwargs) + self.dense = keras.layers.Dense(hidden_size, name="dense") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.hidden_size = hidden_size + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.hidden_size]) + + +class TFSegformerAttention(keras.layers.Layer): + def __init__( + self, + config: SegformerConfig, + hidden_size: int, + num_attention_heads: int, + sequence_reduction_ratio: int, + **kwargs, + ): + super().__init__(**kwargs) + self.self = TFSegformerEfficientSelfAttention( + config=config, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + sequence_reduction_ratio=sequence_reduction_ratio, + name="self", + ) + self.dense_output = TFSegformerSelfOutput(config, hidden_size=hidden_size, name="output") + + def call( + self, hidden_states: tf.Tensor, height: int, width: int, output_attentions: bool = False + ) -> tf.Tensor | tuple[tf.Tensor, tf.Tensor]: + self_outputs = self.self(hidden_states, height, width, output_attentions) + + attention_output = self.dense_output(self_outputs[0]) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + + +class TFSegformerDWConv(keras.layers.Layer): + def __init__(self, dim: int = 768, **kwargs): + super().__init__(**kwargs) + self.depthwise_convolution = keras.layers.Conv2D( + filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv" + ) + self.dim = dim + + def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: + batch_size = shape_list(hidden_states)[0] + num_channels = shape_list(hidden_states)[-1] + hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels)) + hidden_states = self.depthwise_convolution(hidden_states) + + new_height = shape_list(hidden_states)[1] + new_width = shape_list(hidden_states)[2] + num_channels = shape_list(hidden_states)[3] + hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, num_channels)) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "depthwise_convolution", None) is not None: + with tf.name_scope(self.depthwise_convolution.name): + self.depthwise_convolution.build([None, None, None, self.dim]) + + +class TFSegformerMixFFN(keras.layers.Layer): + def __init__( + self, + config: SegformerConfig, + in_features: int, + hidden_features: int | None = None, + out_features: int | None = None, + **kwargs, + ): + super().__init__(**kwargs) + out_features = out_features or in_features + self.dense1 = keras.layers.Dense(hidden_features, name="dense1") + self.depthwise_convolution = TFSegformerDWConv(hidden_features, name="dwconv") + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.dense2 = keras.layers.Dense(out_features, name="dense2") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.hidden_features = hidden_features + self.in_features = in_features + + def call(self, hidden_states: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor: + hidden_states = self.dense1(hidden_states) + hidden_states = self.depthwise_convolution(hidden_states, height, width) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.dense2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense1", None) is not None: + with tf.name_scope(self.dense1.name): + self.dense1.build([None, None, self.in_features]) + if getattr(self, "depthwise_convolution", None) is not None: + with tf.name_scope(self.depthwise_convolution.name): + self.depthwise_convolution.build(None) + if getattr(self, "dense2", None) is not None: + with tf.name_scope(self.dense2.name): + self.dense2.build([None, None, self.hidden_features]) + + +class TFSegformerLayer(keras.layers.Layer): + """This corresponds to the Block class in the original implementation.""" + + def __init__( + self, + config, + hidden_size: int, + num_attention_heads: int, + drop_path: float, + sequence_reduction_ratio: int, + mlp_ratio: int, + **kwargs, + ): + super().__init__(**kwargs) + self.layer_norm_1 = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_1") + self.attention = TFSegformerAttention( + config, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + sequence_reduction_ratio=sequence_reduction_ratio, + name="attention", + ) + self.drop_path = TFSegformerDropPath(drop_path) if drop_path > 0.0 else keras.layers.Activation("linear") + self.layer_norm_2 = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2") + mlp_hidden_size = int(hidden_size * mlp_ratio) + self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp") + self.hidden_size = hidden_size + + def call( + self, + hidden_states: tf.Tensor, + height: int, + width: int, + output_attentions: bool = False, + training: bool = False, + ) -> tuple: + self_attention_outputs = self.attention( + self.layer_norm_1(hidden_states), # in Segformer, layernorm is applied before self-attention + height, + width, + output_attentions=output_attentions, + training=training, + ) + + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection (with stochastic depth) + attention_output = self.drop_path(attention_output, training=training) + hidden_states = attention_output + hidden_states + mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width) + + # second residual connection (with stochastic depth) + mlp_output = self.drop_path(mlp_output, training=training) + layer_output = mlp_output + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer_norm_1", None) is not None: + with tf.name_scope(self.layer_norm_1.name): + self.layer_norm_1.build([None, None, self.hidden_size]) + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "layer_norm_2", None) is not None: + with tf.name_scope(self.layer_norm_2.name): + self.layer_norm_2.build([None, None, self.hidden_size]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + + +class TFSegformerEncoder(keras.layers.Layer): + def __init__(self, config: SegformerConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + + # stochastic depth decay rule + drop_path_decays = [x.numpy() for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))] + + # patch embeddings + embeddings = [] + for i in range(config.num_encoder_blocks): + embeddings.append( + TFSegformerOverlapPatchEmbeddings( + patch_size=config.patch_sizes[i], + stride=config.strides[i], + num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1], + hidden_size=config.hidden_sizes[i], + name=f"patch_embeddings.{i}", + ) + ) + self.embeddings = embeddings + + # Transformer blocks + blocks = [] + cur = 0 + for i in range(config.num_encoder_blocks): + # each block consists of layers + layers = [] + if i != 0: + cur += config.depths[i - 1] + for j in range(config.depths[i]): + layers.append( + TFSegformerLayer( + config, + hidden_size=config.hidden_sizes[i], + num_attention_heads=config.num_attention_heads[i], + drop_path=drop_path_decays[cur + j], + sequence_reduction_ratio=config.sr_ratios[i], + mlp_ratio=config.mlp_ratios[i], + name=f"block.{i}.{j}", + ) + ) + blocks.append(layers) + + self.block = blocks + + # Layer norms + self.layer_norms = [ + keras.layers.LayerNormalization(epsilon=1e-05, name=f"layer_norm.{i}") + for i in range(config.num_encoder_blocks) + ] + + def call( + self, + pixel_values: tf.Tensor, + output_attentions: bool | None = False, + output_hidden_states: bool | None = False, + return_dict: bool | None = True, + training: bool = False, + ) -> tuple | TFBaseModelOutput: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + batch_size = shape_list(pixel_values)[0] + + hidden_states = pixel_values + for idx, x in enumerate(zip(self.embeddings, self.block, self.layer_norms)): + embedding_layer, block_layer, norm_layer = x + # first, obtain patch embeddings + hidden_states, height, width = embedding_layer(hidden_states) + + # second, send embeddings through blocks + # (each block consists of multiple layers i.e., list of layers) + for i, blk in enumerate(block_layer): + layer_outputs = blk( + hidden_states, + height, + width, + output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + # third, apply layer norm + hidden_states = norm_layer(hidden_states) + + # fourth, optionally reshape back to (batch_size, height, width, num_channels) + if idx != len(self.embeddings) - 1 or (idx == len(self.embeddings) - 1 and self.config.reshape_last_stage): + num_channels = shape_list(hidden_states)[-1] + hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels)) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer_norms", None) is not None: + for layer, shape in zip(self.layer_norms, self.config.hidden_sizes): + with tf.name_scope(layer.name): + layer.build([None, None, shape]) + if getattr(self, "block", None) is not None: + for block in self.block: + for layer in block: + with tf.name_scope(layer.name): + layer.build(None) + if getattr(self, "embeddings", None) is not None: + for layer in self.embeddings: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFSegformerMainLayer(keras.layers.Layer): + config_class = SegformerConfig + + def __init__(self, config: SegformerConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + # hierarchical Transformer encoder + self.encoder = TFSegformerEncoder(config, name="encoder") + + @unpack_inputs + def call( + self, + pixel_values: tf.Tensor, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> tuple | TFBaseModelOutput: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format. + # So change the input format from `NCHW` to `NHWC`. + # shape = (batch_size, in_height, in_width, in_channels=num_channels) + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + + encoder_outputs = self.encoder( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = encoder_outputs[0] + # Change to NCHW output format to have uniformity in the modules + sequence_output = tf.transpose(sequence_output, perm=[0, 3, 1, 2]) + + # Change the other hidden state outputs to NCHW as well + if output_hidden_states: + hidden_states = tuple(tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]) + + if not return_dict: + if tf.greater(len(encoder_outputs[1:]), 0): + transposed_encoder_outputs = tuple(tf.transpose(v, perm=[0, 3, 1, 2]) for v in encoder_outputs[1:][0]) + return (sequence_output,) + (transposed_encoder_outputs,) + else: + return (sequence_output,) + encoder_outputs[1:] + + return TFBaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + + +class TFSegformerPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = SegformerConfig + base_model_prefix = "segformer" + main_input_name = "pixel_values" + + @property + def input_signature(self): + return {"pixel_values": tf.TensorSpec(shape=(None, self.config.num_channels, 512, 512), dtype=tf.float32)} + + +SEGFORMER_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`SegformerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +SEGFORMER_INPUTS_DOCSTRING = r""" + + Args: + pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`SegformerImageProcessor.__call__`] for details. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + + training (`bool`, *optional*, defaults to `False``): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.", + SEGFORMER_START_DOCSTRING, +) +class TFSegformerModel(TFSegformerPreTrainedModel): + def __init__(self, config: SegformerConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.config = config + + # hierarchical Transformer encoder + self.segformer = TFSegformerMainLayer(config, name="segformer") + + @unpack_inputs + @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def call( + self, + pixel_values: tf.Tensor, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> tuple | TFBaseModelOutput: + outputs = self.segformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "segformer", None) is not None: + with tf.name_scope(self.segformer.name): + self.segformer.build(None) + + +@add_start_docstrings( + """ + SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden + states) e.g. for ImageNet. + """, + SEGFORMER_START_DOCSTRING, +) +class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: SegformerConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.segformer = TFSegformerMainLayer(config, name="segformer") + + # Classifier head + self.classifier = keras.layers.Dense(config.num_labels, name="classifier") + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def call( + self, + pixel_values: tf.Tensor | None = None, + labels: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + ) -> tuple | TFSequenceClassifierOutput: + outputs = self.segformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # convert last hidden states to (batch_size, height*width, hidden_size) + batch_size = shape_list(sequence_output)[0] + sequence_output = tf.transpose(sequence_output, perm=[0, 2, 3, 1]) + sequence_output = tf.reshape(sequence_output, (batch_size, -1, self.config.hidden_sizes[-1])) + + # global average pooling + sequence_output = tf.reduce_mean(sequence_output, axis=1) + + logits = self.classifier(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "segformer", None) is not None: + with tf.name_scope(self.segformer.name): + self.segformer.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_sizes[-1]]) + + +class TFSegformerMLP(keras.layers.Layer): + """ + Linear Embedding. + """ + + def __init__(self, input_dim: int, config: SegformerConfig, **kwargs): + super().__init__(**kwargs) + self.proj = keras.layers.Dense(config.decoder_hidden_size, name="proj") + self.input_dim = input_dim + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + height = shape_list(hidden_states)[1] + width = shape_list(hidden_states)[2] + hidden_dim = shape_list(hidden_states)[-1] + hidden_states = tf.reshape(hidden_states, (-1, height * width, hidden_dim)) + hidden_states = self.proj(hidden_states) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "proj", None) is not None: + with tf.name_scope(self.proj.name): + self.proj.build([None, None, self.input_dim]) + + +class TFSegformerDecodeHead(TFSegformerPreTrainedModel): + def __init__(self, config: SegformerConfig, **kwargs): + super().__init__(config, **kwargs) + # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size + mlps = [] + for i in range(config.num_encoder_blocks): + mlp = TFSegformerMLP(config=config, input_dim=config.hidden_sizes[i], name=f"linear_c.{i}") + mlps.append(mlp) + self.mlps = mlps + + # the following 3 layers implement the ConvModule of the original implementation + self.linear_fuse = keras.layers.Conv2D( + filters=config.decoder_hidden_size, kernel_size=1, use_bias=False, name="linear_fuse" + ) + self.batch_norm = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="batch_norm") + self.activation = keras.layers.Activation("relu") + + self.dropout = keras.layers.Dropout(config.classifier_dropout_prob) + self.classifier = keras.layers.Conv2D(filters=config.num_labels, kernel_size=1, name="classifier") + + self.config = config + + def call(self, encoder_hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + all_hidden_states = () + for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.mlps): + if self.config.reshape_last_stage is False and len(shape_list(encoder_hidden_state)) == 3: + height = tf.math.sqrt(tf.cast(shape_list(encoder_hidden_state)[1], tf.float32)) + height = width = tf.cast(height, tf.int32) + channel_dim = shape_list(encoder_hidden_state)[-1] + encoder_hidden_state = tf.reshape(encoder_hidden_state, (-1, height, width, channel_dim)) + + # unify channel dimension + encoder_hidden_state = tf.transpose(encoder_hidden_state, perm=[0, 2, 3, 1]) + height, width = shape_list(encoder_hidden_state)[1:3] + encoder_hidden_state = mlp(encoder_hidden_state) + channel_dim = shape_list(encoder_hidden_state)[-1] + encoder_hidden_state = tf.reshape(encoder_hidden_state, (-1, height, width, channel_dim)) + + # upsample + temp_state = tf.transpose(encoder_hidden_states[0], perm=[0, 2, 3, 1]) + upsample_resolution = shape_list(temp_state)[1:-1] + encoder_hidden_state = tf.image.resize(encoder_hidden_state, size=upsample_resolution, method="bilinear") + all_hidden_states += (encoder_hidden_state,) + + hidden_states = self.linear_fuse(tf.concat(all_hidden_states[::-1], axis=-1)) + hidden_states = self.batch_norm(hidden_states, training=training) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + + # logits of shape (batch_size, height/4, width/4, num_labels) + logits = self.classifier(hidden_states) + + return logits + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "linear_fuse", None) is not None: + with tf.name_scope(self.linear_fuse.name): + self.linear_fuse.build( + [None, None, None, self.config.decoder_hidden_size * self.config.num_encoder_blocks] + ) + if getattr(self, "batch_norm", None) is not None: + with tf.name_scope(self.batch_norm.name): + self.batch_norm.build([None, None, None, self.config.decoder_hidden_size]) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, None, self.config.decoder_hidden_size]) + if getattr(self, "mlps", None) is not None: + for layer in self.mlps: + with tf.name_scope(layer.name): + layer.build(None) + + +@add_start_docstrings( + """SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.""", + SEGFORMER_START_DOCSTRING, +) +class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel): + def __init__(self, config: SegformerConfig, **kwargs): + super().__init__(config, **kwargs) + self.segformer = TFSegformerMainLayer(config, name="segformer") + self.decode_head = TFSegformerDecodeHead(config, name="decode_head") + + def hf_compute_loss(self, logits, labels): + # upsample logits to the images' original size + # `labels` is of shape (batch_size, height, width) + label_interp_shape = shape_list(labels)[1:] + + upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear") + # compute weighted loss + loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") + + def masked_loss(real, pred): + unmasked_loss = loss_fct(real, pred) + mask = tf.cast(real != self.config.semantic_loss_ignore_index, dtype=unmasked_loss.dtype) + masked_loss = unmasked_loss * mask + # Reduction strategy in the similar spirit with + # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_utils.py#L210 + reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(mask) + return tf.reshape(reduced_masked_loss, (1,)) + + return masked_loss(labels, upsampled_logits) + + @unpack_inputs + @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFSemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: tf.Tensor, + labels: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + ) -> tuple | TFSemanticSegmenterOutput: + r""" + labels (`tf.Tensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1`, a (per-pixel) classification loss is computed + (Cross-Entropy). + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, TFSegformerForSemanticSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") + >>> model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") + + >>> inputs = image_processor(images=image, return_tensors="tf") + >>> outputs = model(**inputs, training=False) + >>> # logits are of shape (batch_size, num_labels, height/4, width/4) + >>> logits = outputs.logits + >>> list(logits.shape) + [1, 150, 128, 128] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if labels is not None and not self.config.num_labels > 1: + raise ValueError("The number of labels should be greater than one") + + outputs = self.segformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + + encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1] + + logits = self.decode_head(encoder_hidden_states) + + loss = None + if labels is not None: + loss = self.hf_compute_loss(logits=logits, labels=labels) + + # make logits of shape (batch_size, num_labels, height, width) to + # keep them consistent across APIs + logits = tf.transpose(logits, perm=[0, 3, 1, 2]) + + if not return_dict: + if output_hidden_states: + output = (logits,) + outputs[1:] + else: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFSemanticSegmenterOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "segformer", None) is not None: + with tf.name_scope(self.segformer.name): + self.segformer.build(None) + if getattr(self, "decode_head", None) is not None: + with tf.name_scope(self.decode_head.name): + self.decode_head.build(None) + + +__all__ = [ + "TFSegformerDecodeHead", + "TFSegformerForImageClassification", + "TFSegformerForSemanticSegmentation", + "TFSegformerModel", + "TFSegformerPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a525a70490db62f8d28587f02b60d6714490b5d4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/configuration_shieldgemma2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/configuration_shieldgemma2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68c7414c7bb86e2a2ff1fd55a3d80929a2703eef Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/configuration_shieldgemma2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/modeling_shieldgemma2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/modeling_shieldgemma2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5db8eecf989cc1f44be80b4a9aafc4bae0894d73 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/modeling_shieldgemma2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/processing_shieldgemma2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/processing_shieldgemma2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2611d3bbcf917b376974f5dbe66842f677f2f44 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/processing_shieldgemma2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0aba93226dc854776048e1063a16d4b584af862e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/configuration_smolvlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/configuration_smolvlm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1dd132e277a8b2961f000869ba4563507949f011 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/configuration_smolvlm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e66626045d6dc5c8c2ed303756cb9679c18735c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/configuration_speecht5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/configuration_speecht5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dffa2013d9922bea2630a6d8aad3eb82fa000fd9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/configuration_speecht5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/feature_extraction_speecht5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/feature_extraction_speecht5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..068775f55dc2f0990b31842d765ddb67a16a6c88 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/feature_extraction_speecht5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/number_normalizer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/number_normalizer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fed2966b02a77c460d0f2d8bb0f152e6cbed8b07 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/number_normalizer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/processing_speecht5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/processing_speecht5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cdb7da1fd43b5bc674042807e05d62ed27e87a0b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/processing_speecht5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/tokenization_speecht5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/tokenization_speecht5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..181fffc523ee7070011331b40189a02a036a1ead Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/tokenization_speecht5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45c1aa9dfcabf1c4ed1732b8cfd476a59c46e388 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/configuration_splinter.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/configuration_splinter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2ab8941b6a33da49130345150964a7e85aa93cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/configuration_splinter.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/modeling_splinter.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/modeling_splinter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8000d47fb7a5552ac1c2624aede64171a37b838 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/modeling_splinter.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2190271f12d831508acc5db666935a2eaba792a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9e00687933fb0d1b972d83f2341aa0294db04c1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6349255ed3a4756aa7d449ff6af35847886dd136 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_starcoder2 import * + from .modeling_starcoder2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/configuration_starcoder2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/configuration_starcoder2.py new file mode 100644 index 0000000000000000000000000000000000000000..a700b4f4254830103c21f7920dd71dfe5066eaf8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/configuration_starcoder2.py @@ -0,0 +1,207 @@ +# coding=utf-8 +# Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Starcoder2 model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class Starcoder2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a + Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b](https://huggingface.co/bigcode/starcoder2-7b) model. + + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 49152): + Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Starcoder2Model`] + hidden_size (`int`, *optional*, defaults to 3072): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 12288): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 30): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 24): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 2): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + norm_epsilon (`float`, *optional*, defaults to 1e-05): + Epsilon value for the layer norm + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + bos_token_id (`int`, *optional*, defaults to 50256): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 50256): + The id of the "end-of-sequence" token. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + sliding_window (`int`, *optional*): + Sliding window attention window size. If not specified, will default to `None` (no sliding window). + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + residual_dropout (`float`, *optional*, defaults to 0.0): + Residual connection dropout value. + embedding_dropout (`float`, *optional*, defaults to 0.0): + Embedding dropout. + use_bias (`bool`, *optional*, defaults to `True`): + Whether to use bias term on linear layers of the model. + + + ```python + >>> from transformers import Starcoder2Model, Starcoder2Config + + >>> # Initializing a Starcoder2 7B style configuration + >>> configuration = Starcoder2Config() + + >>> # Initializing a model from the Starcoder2 7B style configuration + >>> model = Starcoder2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "starcoder2" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `Starcoder2` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.c_fc": "colwise", + "layers.*.mlp.c_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=49152, + hidden_size=3072, + intermediate_size=12288, + num_hidden_layers=30, + num_attention_heads=24, + num_key_value_heads=2, + hidden_act="gelu_pytorch_tanh", + max_position_embeddings=4096, + initializer_range=0.018042, + norm_epsilon=1e-5, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + rope_theta=10000.0, + rope_scaling=None, + sliding_window=None, + attention_dropout=0.0, + residual_dropout=0.0, + embedding_dropout=0.0, + use_bias=True, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + self.use_bias = use_bias + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.norm_epsilon = norm_epsilon + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_dropout = attention_dropout + self.residual_dropout = residual_dropout + self.embedding_dropout = embedding_dropout + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, move it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + +__all__ = ["Starcoder2Config"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/modeling_starcoder2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/modeling_starcoder2.py new file mode 100644 index 0000000000000000000000000000000000000000..dfdfec22ca99cf1f205dcaa103ca448afe8498dd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/modeling_starcoder2.py @@ -0,0 +1,485 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/starcoder2/modular_starcoder2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_starcoder2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from transformers.utils.generic import check_model_inputs + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import ( + GenericForSequenceClassification, + GenericForTokenClassification, + GradientCheckpointingLayer, +) +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from .configuration_starcoder2 import Starcoder2Config + + +class Starcoder2MLP(nn.Module): + def __init__(self, config: Starcoder2Config): + super().__init__() + embed_dim = config.hidden_size + self.c_fc = nn.Linear(embed_dim, config.intermediate_size, bias=config.use_bias) + self.c_proj = nn.Linear(config.intermediate_size, embed_dim, bias=config.use_bias) + self.act = ACT2FN[config.hidden_act] + self.residual_dropout = config.residual_dropout + + def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.FloatTensor: + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.residual_dropout, training=self.training) + return hidden_states + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class Starcoder2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Starcoder2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.use_bias) + self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias) + self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias) + self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias) + self.residual_dropout = config.residual_dropout + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + sliding_window=getattr(self.config, "sliding_window", None), # diff with Llama + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + attn_output = nn.functional.dropout( + attn_output, p=self.residual_dropout, training=self.training + ) # diff with Llama + + return attn_output, attn_weights + + +class Starcoder2DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Starcoder2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = Starcoder2Attention(config=config, layer_idx=layer_idx) + self.mlp = Starcoder2MLP(config) + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +class Starcoder2RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Starcoder2Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +@auto_docstring +class Starcoder2PreTrainedModel(PreTrainedModel): + config: Starcoder2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Starcoder2DecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": Starcoder2DecoderLayer, + "attentions": Starcoder2Attention, + } + + +@auto_docstring +class Starcoder2Model(Starcoder2PreTrainedModel): + def __init__(self, config: Starcoder2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Starcoder2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) + self.rotary_emb = Starcoder2RotaryEmbedding(config=config) + self.gradient_checkpointing = False + self.embedding_dropout = config.embedding_dropout + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask + causal_mask = mask_function( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout( + hidden_states, p=self.embedding_dropout, training=self.training + ) # main diff with Llama + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values if use_cache else None, + ) + + +@auto_docstring +class Starcoder2ForCausalLM(Starcoder2PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = Starcoder2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, Starcoder2ForCausalLM + + >>> model = Starcoder2ForCausalLM.from_pretrained("meta-starcoder2/Starcoder2-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-starcoder2/Starcoder2-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class Starcoder2ForSequenceClassification(GenericForSequenceClassification, Starcoder2PreTrainedModel): + pass + + +class Starcoder2ForTokenClassification(GenericForTokenClassification, Starcoder2PreTrainedModel): + pass + + +__all__ = [ + "Starcoder2ForCausalLM", + "Starcoder2Model", + "Starcoder2PreTrainedModel", + "Starcoder2ForSequenceClassification", + "Starcoder2ForTokenClassification", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/modular_starcoder2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/modular_starcoder2.py new file mode 100644 index 0000000000000000000000000000000000000000..f37c75510abea044cd800f12adfa098eb6d418b2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/modular_starcoder2.py @@ -0,0 +1,238 @@ +# coding=utf-8 +# Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Starcoder2 model.""" + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from transformers.utils.generic import check_model_inputs + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_outputs import BaseModelOutputWithPast +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, logging +from ...utils.deprecation import deprecate_kwarg +from ..mistral.modeling_mistral import ( + MistralAttention, + MistralDecoderLayer, + MistralForCausalLM, + MistralForSequenceClassification, + MistralForTokenClassification, + MistralModel, + MistralRotaryEmbedding, + apply_rotary_pos_emb, + eager_attention_forward, +) +from .configuration_starcoder2 import Starcoder2Config + + +logger = logging.get_logger(__name__) + + +class Starcoder2MLP(nn.Module): + def __init__(self, config: Starcoder2Config): + super().__init__() + embed_dim = config.hidden_size + self.c_fc = nn.Linear(embed_dim, config.intermediate_size, bias=config.use_bias) + self.c_proj = nn.Linear(config.intermediate_size, embed_dim, bias=config.use_bias) + self.act = ACT2FN[config.hidden_act] + self.residual_dropout = config.residual_dropout + + def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.FloatTensor: + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.residual_dropout, training=self.training) + return hidden_states + + +class Starcoder2Attention(MistralAttention): + def __init__(self, config: Starcoder2Config, layer_idx: Optional[int] = None): + super().__init__(config=config, layer_idx=layer_idx) + self.residual_dropout = config.residual_dropout + self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.use_bias) + self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias) + self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias) + self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + sliding_window=getattr(self.config, "sliding_window", None), # diff with Llama + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + attn_output = nn.functional.dropout( + attn_output, p=self.residual_dropout, training=self.training + ) # diff with Llama + + return attn_output, attn_weights + + +class Starcoder2DecoderLayer(MistralDecoderLayer): + def __init__(self, config: Starcoder2Config, layer_idx: int): + super().__init__(config, layer_idx) + self.self_attn = Starcoder2Attention(config=config, layer_idx=layer_idx) + self.mlp = Starcoder2MLP(config) + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) + + +class Starcoder2RotaryEmbedding(MistralRotaryEmbedding): + pass + + +class Starcoder2Model(MistralModel): + def __init__(self, config: Starcoder2Config): + super().__init__(config) + self.layers = nn.ModuleList( + [Starcoder2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) + self.embedding_dropout = config.embedding_dropout + + @check_model_inputs + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask + causal_mask = mask_function( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout( + hidden_states, p=self.embedding_dropout, training=self.training + ) # main diff with Llama + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values if use_cache else None, + ) + + +class Starcoder2ForCausalLM(MistralForCausalLM): + pass + + +class Starcoder2ForSequenceClassification(MistralForSequenceClassification): + pass + + +class Starcoder2ForTokenClassification(MistralForTokenClassification): + pass + + +__all__ = [ + "Starcoder2ForCausalLM", + "Starcoder2Model", + "Starcoder2PreTrainedModel", # noqa: F822 + "Starcoder2ForSequenceClassification", + "Starcoder2ForTokenClassification", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da99e5539d39fac82edea3638686617083701dfc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/configuration_swiftformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/configuration_swiftformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a44f22d992eafe1f8eb28af71ed61ff77dc57b32 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/configuration_swiftformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/modeling_swiftformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/modeling_swiftformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec7ff9a4e4b7cb848a99e1382db1942df6e848b7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/modeling_swiftformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/modeling_tf_swiftformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/modeling_tf_swiftformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45fdd45532e204eddab34fd0f44bfd28146287f4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/modeling_tf_swiftformer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91ff60c37473a3e1f0cade55144a5630228f03e4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/configuration_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/configuration_swin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00988d9dee0a99a543f35a3b59b016b4479c7c2e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/configuration_swin.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/modeling_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/modeling_swin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb7070e14782b961a07ebd47130cedc8c71bc671 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/modeling_swin.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/modeling_tf_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/modeling_tf_swin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a87e2def120d46685693d25107e04ac0a8000860 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/modeling_tf_swin.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1bc06b02db87cc9af450085d66eadb4ceaf5128e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/configuration_swin2sr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/configuration_swin2sr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..126cc37d332ecaa2ec4eb3413f49d84d6c07accb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/configuration_swin2sr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/image_processing_swin2sr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/image_processing_swin2sr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63c9c273ad4d955241a9fae02e8ac827d237dbbb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/image_processing_swin2sr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/image_processing_swin2sr_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/image_processing_swin2sr_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12410b574a74338b16dd30e25e475ddc8a18407d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/image_processing_swin2sr_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/modeling_swin2sr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/modeling_swin2sr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61eb559a40472ac3692ab01596a4206e9fb9e130 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/modeling_swin2sr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da55b0e5a02ae76542e27d989bf0a69da04399b6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/configuration_swinv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/configuration_swinv2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..058ca7abff9e0db7df67f158134adc23d2b21731 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/configuration_swinv2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/modeling_swinv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/modeling_swinv2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..671d74394c535a9dda8f18364186bdbd22179738 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/modeling_swinv2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40353cb1b6c2f5b50d7b1b68491448f3faf66a04 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/configuration_switch_transformers.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/configuration_switch_transformers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b34b87dd1fe29f1d4349439dee1d625172cca58b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/configuration_switch_transformers.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/modeling_switch_transformers.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/modeling_switch_transformers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..170ca883c7e5bce660deefdef87de5529f9f6a62 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/modeling_switch_transformers.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13e368498c2ba45a71b292903c99df50276005a9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/configuration_t5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/configuration_t5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..524653c925dc393ae72ddbc1d8b333d2eba64587 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/configuration_t5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/modeling_flax_t5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/modeling_flax_t5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1caea927c68c812c26facbd93773547e95678124 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/modeling_flax_t5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/modeling_tf_t5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/modeling_tf_t5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bec3e0daa7636cd6862dede9395cdfdd5fd09ea2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/modeling_tf_t5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/tokenization_t5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/tokenization_t5.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5e0c6f68235d14b3c40835dc0d21736c95a4274 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/tokenization_t5.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/tokenization_t5_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/tokenization_t5_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..caf98dd8353c487f3a58acdd7f9cdd71d90092cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/tokenization_t5_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/modeling_t5gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/modeling_t5gemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b5eaf8cc83ea4f105924263cc4737dcad566b1d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/modeling_t5gemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/tapas/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/tapas/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61a460ad62e7840fbce6dc720b8fd51c2f54e57f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/tapas/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/tapas/__pycache__/configuration_tapas.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/tapas/__pycache__/configuration_tapas.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b98c0e8eb9532220c01d679746f4a560b2ee8d2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/tapas/__pycache__/configuration_tapas.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e3f1da42b306da2ee87b9061c00bc290426fb87 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/configuration_timm_backbone.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/configuration_timm_backbone.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03a24b6c9f448282a4ac6228a0ae833b8c5ac85a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/configuration_timm_backbone.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/modeling_timm_backbone.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/modeling_timm_backbone.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11133600f961bc1e9f8c9f1ca87f18420c51f875 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/modeling_timm_backbone.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d004da69a57429ff1386f5effd773dfef78a9ff1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/configuration_unispeech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/configuration_unispeech.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b1f973f999df2ed4b9e7250ae191259eb1644f8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/configuration_unispeech.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/modeling_unispeech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/modeling_unispeech.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..540d5dca3c01237e7bd009ac4c2e35198633aa69 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/modeling_unispeech.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/modular_unispeech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/modular_unispeech.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d5d2c4a362cae3eecde428590db94f6887411f9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/modular_unispeech.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0544147da87417220aa0ee93a2a370364ade3854 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/configuration_upernet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/configuration_upernet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f58214881db9fb940cf36be4d6e7c4508ccd396 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/configuration_upernet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/modeling_upernet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/modeling_upernet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2efcfd85345eaf94c93744df9fc8aa8d37692976 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/modeling_upernet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51abf6ec5c6cbd3f8e69bd3d9921e206effd8671 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/modeling_vaultgemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/modeling_vaultgemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f29a6ff11f5c474a18b340726d233a23d621e503 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/modeling_vaultgemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/modular_vaultgemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/modular_vaultgemma.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..344ffc8298d850b144fce18e4e003bde54d30ae8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/modular_vaultgemma.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b314828ae6ed66a28208db9cdfb694901df91ae7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/configuration_video_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/configuration_video_llava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0172734f1f1da2b184bf28eab2c5345d00d1d021 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/configuration_video_llava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/image_processing_video_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/image_processing_video_llava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffabae14a80da959715cf8d93aaae7b0884f6981 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/image_processing_video_llava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/modeling_video_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/modeling_video_llava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02141a09042ef8340a399654a1a8914fcded4c04 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/modeling_video_llava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/processing_video_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/processing_video_llava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0fd674e88c672d61869b56d2b4318d90e3be8254 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/processing_video_llava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/video_processing_video_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/video_processing_video_llava.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb345c021b1e817d554ddb6f13388b7593ea7189 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/video_processing_video_llava.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7bafa9d7455e6917d2b920db09df28f9ddb12954 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/configuration_vilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/configuration_vilt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5bba6a019764bedd9a1971d148331bcde50d84a3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/configuration_vilt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/feature_extraction_vilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/feature_extraction_vilt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82308a31d82a2255f06a1503ada96231c1098ca2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/feature_extraction_vilt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/image_processing_vilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/image_processing_vilt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8188a1cedf087967deccf8b86ada2dd2bbd4cf7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/image_processing_vilt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/image_processing_vilt_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/image_processing_vilt_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99b258fe0b534a23c52cd199f6c262bbe4b7a5ce Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/image_processing_vilt_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/modeling_vilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/modeling_vilt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f178f9249bed45568e97c54bb4b63cd9ec91eb5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/modeling_vilt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/processing_vilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/processing_vilt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53b18fae2e083a7300eba00e29f71e1059a9a057 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/processing_vilt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6ef5120c5c472c74a5818f0e6dcea460615d0c8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/configuration_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/configuration_vit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37e924e003cded2fdd84cd6d4aa9adadb317445e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/configuration_vit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/feature_extraction_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/feature_extraction_vit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0c5551f80fcb7de8dd2c1179e14ef08a94f6bda Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/feature_extraction_vit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/image_processing_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/image_processing_vit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb0f13f3daa4d1632cd7aa31ab023cf2df1cce92 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/image_processing_vit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/image_processing_vit_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/image_processing_vit_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a474898431eb5c82a7e86c919d9615e81ef882a9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/image_processing_vit_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_flax_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_flax_vit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6ed4031431c54bb41ab8cd8dfae14c6ccd3a9ca Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_flax_vit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_tf_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_tf_vit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05f0317d6cedf0cbe00b594e72433a7d75b71aa8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_tf_vit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_vit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e74fd0870ef2814421468daf02cf2ec15015f0ae Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_vit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6ed5d5606a37fe96bdce3b56df80f24c5fe2ee5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/configuration_vit_mae.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/configuration_vit_mae.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c02dbafa1f58bb0bd7e497c5938d3586146289f8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/configuration_vit_mae.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/modeling_tf_vit_mae.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/modeling_tf_vit_mae.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be7dca835849d0cfb2a550a5b78fc1aa9a75daef Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/modeling_tf_vit_mae.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/modeling_vit_mae.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/modeling_vit_mae.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9fa4c53534e1d7c79e93065b530b0e4f93961533 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/modeling_vit_mae.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..534e3d2467b563bfbb0bd4ea5683431da48bd132 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/configuration_vit_msn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/configuration_vit_msn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6481ddfed3d26487e0288c2450f894d9b148265 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/configuration_vit_msn.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/modeling_vit_msn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/modeling_vit_msn.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cde6a8cef0ecc42690d3a9d72920b0f8f2d2eb79 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/modeling_vit_msn.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbef8784eec76673ea922205869a5d074069e81d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/configuration_vitdet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/configuration_vitdet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f773916a30f7bc1223e159d7ddedde2f9415b87 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/configuration_vitdet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/modeling_vitdet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/modeling_vitdet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec7e679f736afbe19e5d7e5bd2e184193362be25 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/modeling_vitdet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c13e873dc165d9fe71779b4a92ac705b9cae1df Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/configuration_vitmatte.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/configuration_vitmatte.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3336f81cf88dee7d30d3f9ad697fd3bd33b9eac0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/configuration_vitmatte.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/image_processing_vitmatte.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/image_processing_vitmatte.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f02d995b63c617b609339ffb586e6e233871dd55 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/image_processing_vitmatte.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/image_processing_vitmatte_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/image_processing_vitmatte_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7053ad4179def53f8c89be228a5dc3d8f8604167 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/image_processing_vitmatte_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/modeling_vitmatte.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/modeling_vitmatte.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67a9a41cfd918d5e3a129e9dc0e98264708e4963 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/modeling_vitmatte.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..687edfebb4aee98c87e59ee596197ae22e9da411 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/configuration_vits.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/configuration_vits.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7fc602f3144f002294eed87336499669c2c4e4d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/configuration_vits.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/modeling_vits.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/modeling_vits.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3ca9cae3dda1f80e8ad4441c3a26913689498ab Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/modeling_vits.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/tokenization_vits.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/tokenization_vits.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1845d1a515a0960cbaec9a58d4f2d1834fc2f419 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/tokenization_vits.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vjepa2/__pycache__/configuration_vjepa2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vjepa2/__pycache__/configuration_vjepa2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c95ff0ff28cda94d71305349b58e7b7ff87f9826 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vjepa2/__pycache__/configuration_vjepa2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vjepa2/__pycache__/modeling_vjepa2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vjepa2/__pycache__/modeling_vjepa2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..039c922db27a9bf0db05722fadd7791caefda4b3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vjepa2/__pycache__/modeling_vjepa2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db989ad98d208354153506c66da7a6b6952a34b3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/configuration_voxtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/configuration_voxtral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..713a494f071a1d335c5963ccf7a708d4f2eee675 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/configuration_voxtral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/modeling_voxtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/modeling_voxtral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4e16395c4c630c24029460677750d56f5c8dc66 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/modeling_voxtral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/modular_voxtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/modular_voxtral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2aac3f17ef35e30a3139953c0bfcacd3eb7f6fd8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/modular_voxtral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/processing_voxtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/processing_voxtral.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d37ca5fc4d752c3b3694f20596232c3400e43050 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/processing_voxtral.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d76fc2067b5f907dcfee64bd1106bb7ea117bfdc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/configuration_wav2vec2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/configuration_wav2vec2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b476a79d3041867ecbee5714e5e71fe7ca8bc1e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/configuration_wav2vec2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/feature_extraction_wav2vec2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/feature_extraction_wav2vec2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b49ccf49838e11150e063ab99d1910403d0ff3f3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/feature_extraction_wav2vec2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/modeling_flax_wav2vec2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/modeling_flax_wav2vec2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e94aa079a9c6b67836de882b002b0f22a54c98b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/modeling_flax_wav2vec2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26e0db53d6fc11aeccec22fa576122cd4ec1d555 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/processing_wav2vec2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/processing_wav2vec2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a0d809ec64758c3f8d41d06743880009b027cfa Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/processing_wav2vec2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/tokenization_wav2vec2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/tokenization_wav2vec2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6545de269255aeda4ab389232248a6d17c2d5bd9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/tokenization_wav2vec2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..54d0d9e2c3997c4b2865b5a47e7b64c39ed40b68 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_wav2vec2_conformer import * + from .modeling_wav2vec2_conformer import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..493e96f44037d6b846ad702e0c765335d3ecee08 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py @@ -0,0 +1,360 @@ +# coding=utf-8 +# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Wav2Vec2Conformer model configuration""" + +import functools +import operator + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class Wav2Vec2ConformerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Wav2Vec2ConformerModel`]. It is used to + instantiate an Wav2Vec2Conformer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Wav2Vec2Conformer + [facebook/wav2vec2-conformer-rel-pos-large](https://huggingface.co/facebook/wav2vec2-conformer-rel-pos-large) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*): + Vocabulary size of the Wav2Vec2Conformer model. Defines the number of different tokens that can be + represented by the `inputs_ids` passed when calling [`Wav2Vec2ConformerModel`]. Vocabulary size of the + model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward + method of [`Wav2Vec2ConformerModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + activation_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for activations inside the fully connected layer. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + final_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the final projection layer of [`Wav2Vec2ConformerForCTC`]. + layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) for more + details. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + feat_extract_norm (`str`, *optional*, defaults to `"group"`): + The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group + normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D + convolutional layers. + feat_proj_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for output of the feature encoder. + feat_extract_activation (`str, `optional`, defaults to `"gelu"`): + The non-linear activation function (function or string) in the 1D convolutional layers of the feature + extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for quantized feature encoder states. + conv_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): + A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. + conv_stride (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length + of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. + conv_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The + length of *conv_kernel* defines the number of convolutional layers and has to match the length of + *conv_dim*. + conv_bias (`bool`, *optional*, defaults to `False`): + Whether the 1D convolutional layers have a bias. + num_conv_pos_embeddings (`int`, *optional*, defaults to 128): + Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional + embeddings layer. + num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): + Number of groups of 1D convolutional positional embeddings layer. + apply_spec_augment (`bool`, *optional*, defaults to `True`): + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see + [SpecAugment: A Simple Data Augmentation Method for Automatic Speech + Recognition](https://huggingface.co/papers/1904.08779). + mask_time_prob (`float`, *optional*, defaults to 0.05): + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If + reasoning from the probability of each feature vector to be chosen as the start of the vector span to be + masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the + actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`. + mask_time_length (`int`, *optional*, defaults to 10): + Length of vector span along the time axis. + mask_time_min_masks (`int`, *optional*, defaults to 2),: + The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, + irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < + mask_time_min_masks'' + mask_feature_prob (`float`, *optional*, defaults to 0.0): + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over + the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector + span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap + may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is + True`. + mask_feature_length (`int`, *optional*, defaults to 10): + Length of vector span along the feature axis. + mask_feature_min_masks (`int`, *optional*, defaults to 0),: + The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time + step, irrespectively of `mask_feature_prob`. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' + num_codevectors_per_group (`int`, *optional*, defaults to 320): + Number of entries in each quantization codebook (group). + num_codevector_groups (`int`, *optional*, defaults to 2): + Number of codevector groups for product codevector quantization. + contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): + The temperature *kappa* in the contrastive loss. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for the output of the feature encoder that's used by the quantizer. + num_negatives (`int`, *optional*, defaults to 100): + Number of negative samples for the contrastive loss. + codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the quantized feature vectors. + proj_codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the final projection of both the quantized and the transformer features. + diversity_loss_weight (`int`, *optional*, defaults to 0.1): + The weight of the codebook diversity loss component. + ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): + Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an + instance of [`Wav2Vec2ConformerForCTC`]. + ctc_zero_infinity (`bool`, *optional*, defaults to `False`): + Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly + occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance + of [`Wav2Vec2ConformerForCTC`]. + use_weighted_layer_sum (`bool`, *optional*, defaults to `False`): + Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an + instance of [`Wav2Vec2ConformerForSequenceClassification`]. + classifier_proj_size (`int`, *optional*, defaults to 256): + Dimensionality of the projection before token mean-pooling for classification. + tdnn_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`): + A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN* + module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers. + tdnn_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the + *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*. + tdnn_dilation (`tuple[int]` or `list[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`): + A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the + *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*. + xvector_output_dim (`int`, *optional*, defaults to 512): + Dimensionality of the *XVector* embedding vectors. + add_adapter (`bool`, *optional*, defaults to `False`): + Whether a convolutional network should be stacked on top of the Wav2Vec2Conformer Encoder. Can be very + useful for warm-starting Wav2Vec2Conformer for SpeechEncoderDecoder models. + adapter_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adapter_stride (`int`, *optional*, defaults to 2): + Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + num_adapter_layers (`int`, *optional*, defaults to 3): + Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is + True`. + output_hidden_size (`int`, *optional*): + Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant + if `add_adapter is True`. + position_embeddings_type (`str`, *optional*, defaults to `"relative"`): + Can be specified to `relative` or `rotary` for relative or rotary position embeddings respectively. If left + `None` no relative position embedding is applied. + rotary_embedding_base (`int`, *optional*, defaults to 10000): + If `"rotary"` position embeddings are used, defines the size of the embedding base. + max_source_positions (`int`, *optional*, defaults to 5000): + if `"relative"` position embeddings are used, defines the maximum source input positions. + conv_depthwise_kernel_size (`int`, *optional*, defaults to 31): + Kernel size of convolutional depthwise 1D layer in Conformer blocks. + conformer_conv_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all convolutional layers in Conformer blocks. + + Example: + + ```python + >>> from transformers import Wav2Vec2ConformerConfig, Wav2Vec2ConformerModel + + >>> # Initializing a Wav2Vec2Conformer facebook/wav2vec2-conformer-rel-pos-large style configuration + >>> configuration = Wav2Vec2ConformerConfig() + + >>> # Initializing a model (with random weights) from the facebook/wav2vec2-conformer-rel-pos-large style configuration + >>> model = Wav2Vec2ConformerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "wav2vec2-conformer" + + def __init__( + self, + vocab_size=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout=0.1, + activation_dropout=0.1, + attention_dropout=0.1, + feat_proj_dropout=0.0, + feat_quantizer_dropout=0.0, + final_dropout=0.1, + layerdrop=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + feat_extract_norm="group", + feat_extract_activation="gelu", + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=False, + num_conv_pos_embeddings=128, + num_conv_pos_embedding_groups=16, + apply_spec_augment=True, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + mask_feature_min_masks=0, + num_codevectors_per_group=320, + num_codevector_groups=2, + contrastive_logits_temperature=0.1, + num_negatives=100, + codevector_dim=256, + proj_codevector_dim=256, + diversity_loss_weight=0.1, + ctc_loss_reduction="sum", + ctc_zero_infinity=False, + use_weighted_layer_sum=False, + classifier_proj_size=256, + tdnn_dim=(512, 512, 512, 512, 1500), + tdnn_kernel=(5, 3, 3, 1, 1), + tdnn_dilation=(1, 2, 3, 1, 1), + xvector_output_dim=512, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + add_adapter=False, + adapter_kernel_size=3, + adapter_stride=2, + num_adapter_layers=3, + output_hidden_size=None, + position_embeddings_type="relative", + rotary_embedding_base=10000, + max_source_positions=5000, + conv_depthwise_kernel_size=31, + conformer_conv_dropout=0.1, + **kwargs, + ): + super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_activation = feat_extract_activation + self.conv_dim = list(conv_dim) + self.conv_stride = list(conv_stride) + self.conv_kernel = list(conv_kernel) + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_feat_extract_layers = len(self.conv_dim) + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.use_weighted_layer_sum = use_weighted_layer_sum + self.max_source_positions = max_source_positions + self.position_embeddings_type = position_embeddings_type + self.rotary_embedding_base = rotary_embedding_base + + if ( + (len(self.conv_stride) != self.num_feat_extract_layers) + or (len(self.conv_kernel) != self.num_feat_extract_layers) + or (len(self.conv_dim) != self.num_feat_extract_layers) + ): + raise ValueError( + "Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` ==" + " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) =" + f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`," + f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." + ) + + # Conformer-block related + self.conv_depthwise_kernel_size = conv_depthwise_kernel_size + self.conformer_conv_dropout = conformer_conv_dropout + + # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 + self.apply_spec_augment = apply_spec_augment + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + # parameters for pretraining with codevector quantized representations + self.num_codevectors_per_group = num_codevectors_per_group + self.num_codevector_groups = num_codevector_groups + self.contrastive_logits_temperature = contrastive_logits_temperature + self.feat_quantizer_dropout = feat_quantizer_dropout + self.num_negatives = num_negatives + self.codevector_dim = codevector_dim + self.proj_codevector_dim = proj_codevector_dim + self.diversity_loss_weight = diversity_loss_weight + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # adapter + self.add_adapter = add_adapter + self.adapter_kernel_size = adapter_kernel_size + self.adapter_stride = adapter_stride + self.num_adapter_layers = num_adapter_layers + self.output_hidden_size = output_hidden_size or hidden_size + + # SequenceClassification-specific parameter. Feel free to ignore for other classes. + self.classifier_proj_size = classifier_proj_size + + # XVector-specific parameters. Feel free to ignore for other classes. + self.tdnn_dim = list(tdnn_dim) + self.tdnn_kernel = list(tdnn_kernel) + self.tdnn_dilation = list(tdnn_dilation) + self.xvector_output_dim = xvector_output_dim + + @property + def inputs_to_logits_ratio(self): + return functools.reduce(operator.mul, self.conv_stride, 1) + + +__all__ = ["Wav2Vec2ConformerConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..62357c8e0dcb34bf597e414c567491b38fff8e1d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -0,0 +1,1928 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_wav2vec2_conformer.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + CausalLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + Wav2Vec2BaseModelOutput, + XVectorOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, auto_docstring, is_peft_available +from .configuration_wav2vec2_conformer import Wav2Vec2ConformerConfig + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`Wav2Vec2ConformerForPreTraining`], with potential hidden states and attentions. + """ +) +class Wav2Vec2ConformerForPreTrainingOutput(ModelOutput): + r""" + loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://huggingface.co/papers/2006.11477). + projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. + codevector_perplexity (`torch.FloatTensor` of shape `(1,)`): + The perplexity of the codevector distribution, used to measure the diversity of the codebook. + contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The contrastive loss (L_m) as stated in the [official paper](https://huggingface.co/papers/2006.11477). + diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The diversity loss (L_d) as stated in the [official paper](https://huggingface.co/papers/2006.11477). + """ + + loss: Optional[torch.FloatTensor] = None + projected_states: Optional[torch.FloatTensor] = None + projected_quantized_states: Optional[torch.FloatTensor] = None + codevector_perplexity: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + contrastive_loss: Optional[torch.FloatTensor] = None + diversity_loss: Optional[torch.FloatTensor] = None + + +class Wav2Vec2ConformerSamePadLayer(nn.Module): + def __init__(self, num_conv_pos_embeddings): + super().__init__() + self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class Wav2Vec2ConformerPositionalConvEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=config.num_conv_pos_embeddings, + padding=config.num_conv_pos_embeddings // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0): + self.conv = weight_norm(self.conv, name="weight", dim=2) + if hasattr(self.conv, "parametrizations"): + weight_g = self.conv.parametrizations.weight.original0 + weight_v = self.conv.parametrizations.weight.original1 + else: + weight_g = self.conv.weight_g + weight_v = self.conv.weight_v + deepspeed.zero.register_external_parameter(self, weight_v) + deepspeed.zero.register_external_parameter(self, weight_g) + else: + self.conv = weight_norm(self.conv, name="weight", dim=2) + + self.padding = Wav2Vec2ConformerSamePadLayer(config.num_conv_pos_embeddings) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Wav2Vec2ConformerRotaryPositionalEmbedding(nn.Module): + """Rotary positional embedding + Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://huggingface.co/papers/2104.09864 + """ + + def __init__(self, config): + super().__init__() + dim = config.hidden_size // config.num_attention_heads + base = config.rotary_embedding_base + + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)) + self.register_buffer("inv_freq", inv_freq) + self.cached_sequence_length = None + self.cached_rotary_positional_embedding = None + + def forward(self, hidden_states): + sequence_length = hidden_states.shape[1] + + if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None: + return self.cached_rotary_positional_embedding + + self.cached_sequence_length = sequence_length + # Embeddings are computed in the dtype of the inv_freq constant + time_stamps = torch.arange(sequence_length).type_as(self.inv_freq) + freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq) + embeddings = torch.cat((freqs, freqs), dim=-1) + + cos_embeddings = embeddings.cos()[:, None, None, :] + sin_embeddings = embeddings.sin()[:, None, None, :] + # Computed embeddings are cast to the dtype of the hidden state inputs + self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings]).type_as(hidden_states) + return self.cached_rotary_positional_embedding + + +class Wav2Vec2ConformerRelPositionalEmbedding(nn.Module): + """Relative positional encoding module.""" + + def __init__(self, config): + super().__init__() + self.max_len = config.max_source_positions + self.d_model = config.hidden_size + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, self.max_len)) + + def extend_pe(self, x): + # Reset the positional encodings + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(1) >= x.size(1) * 2 - 1: + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` is the position of query vector and `j` is the + # position of key vector. We use positive relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2ConformerLayerNormConvLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + + hidden_states = hidden_states.transpose(-2, -1) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(-2, -1) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2ConformerGroupNormConvLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2ConformerFeatureEncoder(nn.Module): + """Construct the features from raw audio waveform""" + + def __init__(self, config): + super().__init__() + + if config.feat_extract_norm == "group": + conv_layers = [Wav2Vec2ConformerGroupNormConvLayer(config, layer_id=0)] + [ + Wav2Vec2ConformerNoLayerNormConvLayer(config, layer_id=i + 1) + for i in range(config.num_feat_extract_layers - 1) + ] + elif config.feat_extract_norm == "layer": + conv_layers = [ + Wav2Vec2ConformerLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers) + ] + else: + raise ValueError( + f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" + ) + self.conv_layers = nn.ModuleList(conv_layers) + self.gradient_checkpointing = False + self._requires_grad = True + + def _freeze_parameters(self): + for param in self.parameters(): + param.requires_grad = False + self._requires_grad = False + + def forward(self, input_values): + hidden_states = input_values[:, None] + + # make sure hidden_states require grad for gradient_checkpointing + if self._requires_grad and self.training: + hidden_states.requires_grad = True + + for conv_layer in self.conv_layers: + hidden_states = conv_layer(hidden_states) + + return hidden_states + + +class Wav2Vec2ConformerFeatureProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) + self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) + self.dropout = nn.Dropout(config.feat_proj_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states, norm_hidden_states + + +class Wav2Vec2ConformerFeedForward(nn.Module): + def __init__(self, config): + super().__init__() + self.intermediate_dropout = nn.Dropout(config.activation_dropout) + + self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class Wav2Vec2ConformerConvolutionModule(nn.Module): + """Convolution block used in the conformer block""" + + def __init__(self, config): + super().__init__() + if (config.conv_depthwise_kernel_size - 1) % 2 == 1: + raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding") + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.pointwise_conv1 = nn.Conv1d( + config.hidden_size, + 2 * config.hidden_size, + kernel_size=1, + stride=1, + padding=0, + bias=False, + ) + self.glu = nn.GLU(dim=1) + self.depthwise_conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + config.conv_depthwise_kernel_size, + stride=1, + padding=(config.conv_depthwise_kernel_size - 1) // 2, + groups=config.hidden_size, + bias=False, + ) + self.batch_norm = nn.BatchNorm1d(config.hidden_size) + self.activation = ACT2FN[config.hidden_act] + self.pointwise_conv2 = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=1, + stride=1, + padding=0, + bias=False, + ) + self.dropout = nn.Dropout(config.conformer_conv_dropout) + + def forward(self, hidden_states): + hidden_states = self.layer_norm(hidden_states) + # exchange the temporal dimension and the feature dimension + hidden_states = hidden_states.transpose(1, 2) + + # GLU mechanism + # => (batch, 2*channel, dim) + hidden_states = self.pointwise_conv1(hidden_states) + # => (batch, channel, dim) + hidden_states = self.glu(hidden_states) + + # 1D Depthwise Conv + hidden_states = self.depthwise_conv(hidden_states) + hidden_states = self.batch_norm(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = self.pointwise_conv2(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Wav2Vec2ConformerSelfAttention(nn.Module): + """Construct an Wav2Vec2ConformerSelfAttention object. + Can be enhanced with rotary or relative position embeddings. + """ + + def __init__(self, config): + super().__init__() + + self.head_size = config.hidden_size // config.num_attention_heads + self.num_heads = config.num_attention_heads + self.position_embeddings_type = config.position_embeddings_type + + self.linear_q = nn.Linear(config.hidden_size, config.hidden_size) + self.linear_k = nn.Linear(config.hidden_size, config.hidden_size) + self.linear_v = nn.Linear(config.hidden_size, config.hidden_size) + self.linear_out = nn.Linear(config.hidden_size, config.hidden_size) + + self.dropout = nn.Dropout(p=config.attention_dropout) + + if self.position_embeddings_type == "relative": + # linear transformation for positional encoding + self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://huggingface.co/papers/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size)) + self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size)) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + relative_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + # self-attention mechanism + batch_size, sequence_length, hidden_size = hidden_states.size() + + # make sure query/key states can be != value states + query_key_states = hidden_states + value_states = hidden_states + + if self.position_embeddings_type == "rotary": + if relative_position_embeddings is None: + raise ValueError( + "`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'" + ) + query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings) + + # project query_key_states and value_states + query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size) + key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size) + value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size) + + # => (batch, head, time1, d_k) + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + + if self.position_embeddings_type == "relative": + if relative_position_embeddings is None: + raise ValueError( + "`relative_position_embeddings` has to be defined when `self.position_embeddings_type ==" + " 'relative'" + ) + # apply relative_position_embeddings to qk scores + # as proposed in Transformer_XL: https://huggingface.co/papers/1901.02860 + scores = self._apply_relative_embeddings( + query=query, key=key, relative_position_embeddings=relative_position_embeddings + ) + else: + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size) + + # apply attention_mask if necessary + if attention_mask is not None: + scores = scores + attention_mask + + # => (batch, head, time1, time2) + probs = torch.softmax(scores, dim=-1) + probs = self.dropout(probs) + + # => (batch, head, time1, d_k) + hidden_states = torch.matmul(probs, value) + + # => (batch, time1, hidden_size) + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size) + hidden_states = self.linear_out(hidden_states) + + return hidden_states, probs + + def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings): + batch_size, sequence_length, hidden_size = hidden_states.size() + hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size) + + cos = relative_position_embeddings[0, :sequence_length, ...] + sin = relative_position_embeddings[1, :sequence_length, ...] + + # rotate hidden_states with rotary embeddings + hidden_states = hidden_states.transpose(0, 1) + rotated_states_begin = hidden_states[..., : self.head_size // 2] + rotated_states_end = hidden_states[..., self.head_size // 2 :] + rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1) + hidden_states = (hidden_states * cos) + (rotated_states * sin) + hidden_states = hidden_states.transpose(0, 1) + + hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size) + + return hidden_states + + def _apply_relative_embeddings(self, query, key, relative_position_embeddings): + # 1. project positional embeddings + # => (batch, head, 2*time1-1, d_k) + proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings) + proj_relative_position_embeddings = proj_relative_position_embeddings.view( + relative_position_embeddings.size(0), -1, self.num_heads, self.head_size + ) + proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2) + proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3) + + # 2. Add bias to query + # => (batch, head, time1, d_k) + query = query.transpose(1, 2) + q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2) + q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2) + + # 3. attention score: first compute matrix a and matrix c + # as described in https://huggingface.co/papers/1901.02860 Section 3.3 + # => (batch, head, time1, time2) + scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1)) + + # 4. then compute matrix b and matrix d + # => (batch, head, time1, 2*time1-1) + scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings) + + # 5. shift matrix b and matrix d + zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype) + scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1) + scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2]) + scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape) + scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd) + scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1] + + # 6. sum matrices + # => (batch, head, time1, time2) + scores = (scores_ac + scores_bd) / math.sqrt(self.head_size) + + return scores + + +class Wav2Vec2ConformerEncoderLayer(GradientCheckpointingLayer): + """Conformer block based on https://huggingface.co/papers/2005.08100.""" + + def __init__(self, config): + super().__init__() + embed_dim = config.hidden_size + dropout = config.attention_dropout + + # Feed-forward 1 + self.ffn1_layer_norm = nn.LayerNorm(embed_dim) + self.ffn1 = Wav2Vec2ConformerFeedForward(config) + + # Self-Attention + self.self_attn_layer_norm = nn.LayerNorm(embed_dim) + self.self_attn_dropout = nn.Dropout(dropout) + self.self_attn = Wav2Vec2ConformerSelfAttention(config) + + # Conformer Convolution + self.conv_module = Wav2Vec2ConformerConvolutionModule(config) + + # Feed-forward 2 + self.ffn2_layer_norm = nn.LayerNorm(embed_dim) + self.ffn2 = Wav2Vec2ConformerFeedForward(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + def forward( + self, + hidden_states, + attention_mask: Optional[torch.Tensor] = None, + relative_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + # 1. Feed-Forward 1 layer + residual = hidden_states + hidden_states = self.ffn1_layer_norm(hidden_states) + hidden_states = self.ffn1(hidden_states) + hidden_states = hidden_states * 0.5 + residual + residual = hidden_states + + # 2. Self-Attention layer + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weigts = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + relative_position_embeddings=relative_position_embeddings, + output_attentions=output_attentions, + ) + hidden_states = self.self_attn_dropout(hidden_states) + hidden_states = hidden_states + residual + + # 3. Convolutional Layer + residual = hidden_states + hidden_states = self.conv_module(hidden_states) + hidden_states = residual + hidden_states + + # 4. Feed-Forward 2 Layer + residual = hidden_states + hidden_states = self.ffn2_layer_norm(hidden_states) + hidden_states = self.ffn2(hidden_states) + hidden_states = hidden_states * 0.5 + residual + hidden_states = self.final_layer_norm(hidden_states) + + return hidden_states, attn_weigts + + +class Wav2Vec2ConformerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + if config.position_embeddings_type == "relative": + self.embed_positions = Wav2Vec2ConformerRelPositionalEmbedding(config) + elif config.position_embeddings_type == "rotary": + self.embed_positions = Wav2Vec2ConformerRotaryPositionalEmbedding(config) + else: + self.embed_positions = None + + self.pos_conv_embed = Wav2Vec2ConformerPositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList([Wav2Vec2ConformerEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0.0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + hidden_states = self.dropout(hidden_states) + + if self.embed_positions is not None: + relative_position_embeddings = self.embed_positions(hidden_states) + else: + relative_position_embeddings = None + + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = self.training and dropout_probability < self.config.layerdrop + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync + layer_outputs = layer( + hidden_states, + attention_mask=attention_mask, + relative_position_embeddings=relative_position_embeddings, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2ConformerGumbelVectorQuantizer(nn.Module): + """ + Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH + GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information. + """ + + def __init__(self, config): + super().__init__() + self.num_groups = config.num_codevector_groups + self.num_vars = config.num_codevectors_per_group + + if config.codevector_dim % self.num_groups != 0: + raise ValueError( + f"`config.codevector_dim {config.codevector_dim} must be divisible " + f"by `config.num_codevector_groups` {self.num_groups} for concatenation" + ) + + # storage for codebook variables (codewords) + self.codevectors = nn.Parameter( + torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups) + ) + self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) + + # can be decayed for training + self.temperature = 2 + + @staticmethod + def _compute_perplexity(probs, mask=None): + if mask is not None: + mask_extended = mask.flatten()[:, None, None].expand(probs.shape) + probs = torch.where(mask_extended, probs, torch.zeros_like(probs)) + marginal_probs = probs.sum(dim=0) / mask.sum() + else: + marginal_probs = probs.mean(dim=0) + + perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum() + return perplexity + + def forward(self, hidden_states, mask_time_indices=None): + batch_size, sequence_length, hidden_size = hidden_states.shape + + # project to codevector dim + hidden_states = self.weight_proj(hidden_states) + hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1) + + if self.training: + # sample code vector probs via gumbel in differentiateable way + codevector_probs = nn.functional.gumbel_softmax( + hidden_states.float(), tau=self.temperature, hard=True + ).type_as(hidden_states) + + # compute perplexity + codevector_soft_dist = torch.softmax( + hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1 + ) + perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices) + else: + # take argmax in non-differentiable way + # comptute hard codevector distribution (one hot) + codevector_idx = hidden_states.argmax(dim=-1) + codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_( + -1, codevector_idx.view(-1, 1), 1.0 + ) + codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1) + + perplexity = self._compute_perplexity(codevector_probs, mask_time_indices) + + codevector_probs = codevector_probs.view(batch_size * sequence_length, -1) + # use probs to retrieve codevectors + codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors + codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1) + codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1) + + return codevectors, perplexity + + +class Wav2Vec2ConformerAdapter(nn.Module): + def __init__(self, config): + super().__init__() + + # feature dim might need to be down-projected + if config.output_hidden_size != config.hidden_size: + self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) + self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) + else: + self.proj = self.proj_layer_norm = None + + self.layers = nn.ModuleList(Wav2Vec2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers)) + self.layerdrop = config.layerdrop + + def forward(self, hidden_states): + # down project hidden_states if necessary + if self.proj is not None and self.proj_layer_norm is not None: + hidden_states = self.proj(hidden_states) + hidden_states = self.proj_layer_norm(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + + for layer in self.layers: + layerdrop_prob = np.random.random() + if not self.training or (layerdrop_prob > self.layerdrop): + hidden_states = layer(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Wav2Vec2ConformerAdapterLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.output_hidden_size, + 2 * config.output_hidden_size, + config.adapter_kernel_size, + stride=config.adapter_stride, + padding=1, + ) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = nn.functional.glu(hidden_states, dim=1) + + return hidden_states + + +@auto_docstring +class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel): + config: Wav2Vec2ConformerConfig + base_model_prefix = "wav2vec2_conformer" + main_input_name = "input_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init. + if isinstance(module, Wav2Vec2ConformerForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_hf_initialized = True + module.project_q._is_hf_initialized = True + # gumbel softmax requires special init + elif isinstance(module, Wav2Vec2ConformerGumbelVectorQuantizer): + module.weight_proj.weight.data.normal_(mean=0.0, std=1) + module.weight_proj.bias.data.zero_() + nn.init.uniform_(module.codevectors) + elif isinstance(module, Wav2Vec2ConformerSelfAttention): + if hasattr(module, "pos_bias_u"): + nn.init.xavier_uniform_(module.pos_bias_u) + if hasattr(module, "pos_bias_v"): + nn.init.xavier_uniform_(module.pos_bias_v) + elif isinstance(module, Wav2Vec2ConformerPositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, Wav2Vec2ConformerFeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + + def _get_feat_extract_output_lengths( + self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None + ): + """ + Computes the output length of the convolutional layers + """ + + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + if add_adapter: + for _ in range(self.config.num_adapter_layers): + input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) + + return input_lengths + + def _get_feature_vector_attention_mask( + self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None + ): + # Effectively attention_mask.sum(-1), but not inplace to be able to run + # on inference mode. + non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1] + + output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) + output_lengths = output_lengths.to(torch.long) + + batch_size = attention_mask.shape[0] + + attention_mask = torch.zeros( + (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device + ) + # these two operations makes sure that all values before the output lengths idxs are attended to + attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + return attention_mask + + +def _compute_mask_indices( + shape: tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[torch.LongTensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. + + Args: + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. + mask_length: size of the mask + min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. + """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" + ) + + # epsilon is used for probabilistic rounding + epsilon = np.random.rand(1).item() + + def compute_num_masked_span(input_length): + """Given input length, compute how many spans should be masked""" + num_masked_span = int(mask_prob * input_length / mask_length + epsilon) + num_masked_span = max(num_masked_span, min_masks) + + # make sure num masked span <= sequence_length + if num_masked_span * mask_length > sequence_length: + num_masked_span = sequence_length // mask_length + + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + + return num_masked_span + + # compute number of masked spans in batch + input_lengths = ( + attention_mask.detach().sum(-1).tolist() + if attention_mask is not None + else [sequence_length for _ in range(batch_size)] + ) + + # SpecAugment mask to fill + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool) + spec_aug_mask_idxs = [] + + max_num_masked_span = compute_num_masked_span(sequence_length) + + if max_num_masked_span == 0: + return spec_aug_mask + + for input_length in input_lengths: + # compute num of masked spans for this input + num_masked_span = compute_num_masked_span(input_length) + + # get random indices to mask + spec_aug_mask_idx = np.random.choice( + np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + ) + + # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. + if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] + + spec_aug_mask_idx = np.concatenate( + [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] + ) + spec_aug_mask_idxs.append(spec_aug_mask_idx) + + spec_aug_mask_idxs = np.array(spec_aug_mask_idxs) + + # expand masked indices to masked spans + spec_aug_mask_idxs = np.broadcast_to( + spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) + ) + spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + + # add offset to the starting indexes so that indexes now create a span + offsets = np.arange(mask_length)[None, None, :] + offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( + batch_size, max_num_masked_span * mask_length + ) + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + + # scatter indices to mask + np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) + + return spec_aug_mask + + +Wav2Vec2ConformerBaseModelOutput = Wav2Vec2BaseModelOutput + + +@auto_docstring +class Wav2Vec2ConformerModel(Wav2Vec2ConformerPreTrainedModel): + def __init__(self, config: Wav2Vec2ConformerConfig): + super().__init__(config) + self.config = config + self.feature_extractor = Wav2Vec2ConformerFeatureEncoder(config) + self.feature_projection = Wav2Vec2ConformerFeatureProjection(config) + + # model only needs masking vector if mask prob is > 0.0 + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_()) + + self.encoder = Wav2Vec2ConformerEncoder(config) + + self.adapter = Wav2Vec2ConformerAdapter(config) if config.add_adapter else None + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.feature_extractor._freeze_parameters() + + def _mask_hidden_states( + self, + hidden_states: torch.FloatTensor, + mask_time_indices: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + ): + """ + Masks extracted features along time axis and/or along feature axis according to + [SpecAugment](https://huggingface.co/papers/1904.08779). + """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + # generate indices & apply SpecAugment along time axis + batch_size, sequence_length, hidden_size = hidden_states.size() + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + elif self.config.mask_time_prob > 0 and self.training: + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) + mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + hidden_states[mask_feature_indices] = 0 + + return hidden_states + + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, Wav2Vec2ConformerBaseModelOutput]: + r""" + mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict + masked extracted features in *config.proj_codevector_dim* space. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + hidden_states, extract_features = self.feature_projection(extract_features) + hidden_states = self._mask_hidden_states( + hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states) + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return Wav2Vec2ConformerBaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Wav2Vec2Conformer Model with a quantizer and `VQ` head on top. + """ +) +class Wav2Vec2ConformerForPreTraining(Wav2Vec2ConformerPreTrainedModel): + def __init__(self, config: Wav2Vec2ConformerConfig): + super().__init__(config) + self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) + self.dropout_features = nn.Dropout(config.feat_quantizer_dropout) + + self.quantizer = Wav2Vec2ConformerGumbelVectorQuantizer(config) + + self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim) + self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim) + + # Initialize weights and apply final processing + self.post_init() + + def set_gumbel_temperature(self, temperature: int): + """ + Set the Gumbel softmax temperature to a given value. Only necessary for training + """ + self.quantizer.temperature = temperature + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2_conformer.feature_extractor._freeze_parameters() + + @staticmethod + def compute_contrastive_logits( + target_features: torch.FloatTensor, + negative_features: torch.FloatTensor, + predicted_features: torch.FloatTensor, + temperature: int = 0.1, + ): + """ + Compute logits for contrastive loss based using cosine similarity as the distance measure between + `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied. + """ + target_features = torch.cat([target_features, negative_features], dim=0) + + logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as( + target_features + ) + + # apply temperature + logits = logits / temperature + return logits + + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.BoolTensor] = None, + sampled_negative_indices: Optional[torch.BoolTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, Wav2Vec2ConformerForPreTrainingOutput]: + r""" + mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict + masked extracted features in *config.proj_codevector_dim* space. + sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*): + Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss. + Required input for pre-training. + + Example: + + ```python + >>> import torch + >>> from transformers import AutoFeatureExtractor, Wav2Vec2ConformerForPreTraining + >>> from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import _compute_mask_indices, _sample_negative_indices + >>> from datasets import load_dataset + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2_conformer-base") + >>> model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2_conformer-base") + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 + + >>> # compute masked indices + >>> batch_size, raw_sequence_length = input_values.shape + >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item() + >>> mask_time_indices = _compute_mask_indices( + ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2 + ... ) + >>> sampled_negative_indices = _sample_negative_indices( + ... features_shape=(batch_size, sequence_length), + ... num_negatives=model.config.num_negatives, + ... mask_time_indices=mask_time_indices, + ... ) + >>> mask_time_indices = torch.tensor(data=mask_time_indices, device=input_values.device, dtype=torch.long) + >>> sampled_negative_indices = torch.tensor( + ... data=sampled_negative_indices, device=input_values.device, dtype=torch.long + ... ) + + >>> with torch.no_grad(): + ... outputs = model(input_values, mask_time_indices=mask_time_indices) + + >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) + >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) + + >>> # show that cosine similarity is much higher than random + >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5 + tensor(True) + + >>> # for contrastive loss training model should be put into train mode + >>> model = model.train() + >>> loss = model( + ... input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices + ... ).loss + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if mask_time_indices is not None: + mask_time_indices = mask_time_indices.to(torch.bool) + + outputs = self.wav2vec2_conformer( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + mask_time_indices=mask_time_indices, + return_dict=return_dict, + ) + + # 1. project all transformed features (including masked) to final vq dim + transformer_features = self.project_hid(outputs[0]) + + # 2. quantize all (unmasked) extracted features and project to final vq dim + extract_features = self.dropout_features(outputs[1]) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + quantized_features, codevector_perplexity = self.quantizer( + extract_features, mask_time_indices=mask_time_indices + ) + + quantized_features = quantized_features.to(self.project_q.weight.dtype) + quantized_features = self.project_q(quantized_features) + + loss = contrastive_loss = diversity_loss = None + if sampled_negative_indices is not None: + batch_size, sequence_length, hidden_size = quantized_features.shape + + # for training, we sample negatives + # 3. sample K negatives (distractors) quantized states for contrastive loss + # if attention_mask is passed, make sure that padded feature vectors cannot be sampled + # sample negative quantized vectors BTC => (BxT)C + negative_quantized_features = quantized_features.view(-1, hidden_size)[ + sampled_negative_indices.long().view(-1) + ] + negative_quantized_features = negative_quantized_features.view( + batch_size, sequence_length, -1, hidden_size + ).permute(2, 0, 1, 3) + + # 4. compute logits, corresponding to `logs = sim(c_t, [q_t, \sim{q}_t]) / \kappa` + # of equation (3) in https://huggingface.co/papers/2006.11477 + logits = self.compute_contrastive_logits( + quantized_features[None, :], + negative_quantized_features, + transformer_features, + self.config.contrastive_logits_temperature, + ) + + # 5. if a negative vector is identical to the positive (i.e. when codebook utilization is low), + # its cosine similarity will be masked + neg_is_pos = (quantized_features == negative_quantized_features).all(-1) + + if neg_is_pos.any(): + logits[1:][neg_is_pos] = float("-inf") + + # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) = + # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) + logits = logits.transpose(0, 2).reshape(-1, logits.size(0)) + target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten() + + contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum") + # 7. compute diversity loss: \mathbf{L}_d + num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups + diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum() + + # 8. \mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d + loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss + + if not return_dict: + if loss is not None: + return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + + return Wav2Vec2ConformerForPreTrainingOutput( + loss=loss, + projected_states=transformer_features, + projected_quantized_states=quantized_features, + codevector_perplexity=codevector_perplexity, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + contrastive_loss=contrastive_loss, + diversity_loss=diversity_loss, + ) + + +_HIDDEN_STATES_START_POSITION = 2 + + +@auto_docstring( + custom_intro=""" + Wav2Vec2Conformer Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + """ +) +class Wav2Vec2ConformerForCTC(Wav2Vec2ConformerPreTrainedModel): + def __init__(self, config, target_lang: Optional[str] = None): + r""" + target_lang (`str`, *optional*): + Language id of adapter weights. Adapter weights are stored in the format adapter..safetensors or + adapter..bin. Only relevant when using an instance of [`UniSpeechSatForCTC`] with adapters. Uses 'eng' by + default. + """ + super().__init__(config) + + self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) + self.dropout = nn.Dropout(config.final_dropout) + + self.target_lang = target_lang + + if config.vocab_size is None: + raise ValueError( + f"You are trying to instantiate {self.__class__} with a configuration that " + "does not define the vocabulary size of the language model head. Please " + "instantiate the model as follows: `Wav2Vec2ConformerForCTC.from_pretrained(..., vocab_size=vocab_size)`. " + "or define `vocab_size` of your model's configuration." + ) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2_conformer.feature_extractor._freeze_parameters() + + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[tuple, CausalLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): + Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to + the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. + All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., + config.vocab_size - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None and labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + + outputs = self.wav2vec2_conformer( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # retrieve loss input_lengths from attention_mask + attention_mask = ( + attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long) + ) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) + + # assuming that padded tokens are filled with -100 + # when not being attended to + labels_mask = labels >= 0 + target_lengths = labels_mask.sum(-1) + flattened_targets = labels.masked_select(labels_mask) + + # ctc_loss doesn't support fp16 + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) + + with torch.backends.cudnn.flags(enabled=False): + loss = nn.functional.ctc_loss( + log_probs, + flattened_targets, + input_lengths, + target_lengths, + blank=self.config.pad_token_id, + reduction=self.config.ctc_loss_reduction, + zero_infinity=self.config.ctc_zero_infinity, + ) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@auto_docstring( + custom_intro=""" + Wav2Vec2Conformer Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like + SUPERB Keyword Spotting. + """ +) +class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)" + ) + self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) + self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2_conformer.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2_conformer.parameters(): + param.requires_grad = False + + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[tuple, SequenceClassifierOutput]: + r""" + input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2_conformer( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + if attention_mask is None: + pooled_output = hidden_states.mean(dim=1) + else: + padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) + expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_padding_mask] = 0.0 + pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Audio frame classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)" + ) + self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.num_labels = config.num_labels + + self.init_weights() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2_conformer.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2_conformer.parameters(): + param.requires_grad = False + + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TokenClassifierOutput]: + r""" + input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2_conformer( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1)) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class AMSoftmaxLoss(nn.Module): + def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4): + super().__init__() + self.scale = scale + self.margin = margin + self.num_labels = num_labels + self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True) + self.loss = nn.CrossEntropyLoss() + + def forward(self, hidden_states, labels): + labels = labels.flatten() + weight = nn.functional.normalize(self.weight, dim=0) + hidden_states = nn.functional.normalize(hidden_states, dim=1) + cos_theta = torch.mm(hidden_states, weight) + psi = cos_theta - self.margin + + onehot = nn.functional.one_hot(labels, self.num_labels) + logits = self.scale * torch.where(onehot.bool(), psi, cos_theta) + loss = self.loss(logits, labels) + + return loss + + +class TDNNLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id] + self.out_conv_dim = config.tdnn_dim[layer_id] + self.kernel_size = config.tdnn_kernel[layer_id] + self.dilation = config.tdnn_dilation[layer_id] + + self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim) + self.activation = nn.ReLU() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + if is_peft_available(): + from peft.tuners.lora import LoraLayer + + if is_peft_available(): + if isinstance(self.kernel, LoraLayer): + warnings.warn( + "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. " + "You should exclude TDNNLayer from LoRA's target modules.", + ) + + # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up + hidden_states = hidden_states.transpose(1, 2) + weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2) + hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation) + hidden_states = hidden_states.transpose(1, 2) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +@auto_docstring( + custom_intro=""" + Wav2Vec2Conformer Model with an XVector feature extraction head on top for tasks like Speaker Verification. + """ +) +class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0]) + + tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))] + self.tdnn = nn.ModuleList(tdnn_layers) + + self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim) + self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim) + + self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels) + + self.init_weights() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2_conformer.feature_extractor._freeze_parameters() + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for param in self.wav2vec2_conformer.parameters(): + param.requires_grad = False + + def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]): + """ + Computes the output length of the TDNN layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return (input_length - kernel_size) // stride + 1 + + for kernel_size in self.config.tdnn_kernel: + input_lengths = _conv_out_length(input_lengths, kernel_size, 1) + + return input_lengths + + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + ) -> Union[tuple, XVectorOutput]: + r""" + input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2_conformer( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + + for tdnn_layer in self.tdnn: + hidden_states = tdnn_layer(hidden_states) + + # Statistic Pooling + if attention_mask is None: + mean_features = hidden_states.mean(dim=1) + std_features = hidden_states.std(dim=1) + else: + feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) + tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) + mean_features = [] + std_features = [] + for i, length in enumerate(tdnn_output_lengths): + mean_features.append(hidden_states[i, :length].mean(dim=0)) + std_features.append(hidden_states[i, :length].std(dim=0)) + mean_features = torch.stack(mean_features) + std_features = torch.stack(std_features) + statistic_pooling = torch.cat([mean_features, std_features], dim=-1) + + output_embeddings = self.feature_extractor(statistic_pooling) + logits = self.classifier(output_embeddings) + + loss = None + if labels is not None: + loss = self.objective(logits, labels) + + if not return_dict: + output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return XVectorOutput( + loss=loss, + logits=logits, + embeddings=output_embeddings, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "Wav2Vec2ConformerForAudioFrameClassification", + "Wav2Vec2ConformerForCTC", + "Wav2Vec2ConformerForPreTraining", + "Wav2Vec2ConformerForSequenceClassification", + "Wav2Vec2ConformerForXVector", + "Wav2Vec2ConformerModel", + "Wav2Vec2ConformerPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..bfa6c20737d8060f8ceca29926f113c2fc176ac9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py @@ -0,0 +1,728 @@ +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, Wav2Vec2BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, auto_docstring, logging +from ..wav2vec2.modeling_wav2vec2 import ( + Wav2Vec2Adapter, + Wav2Vec2AdapterLayer, + Wav2Vec2FeatureEncoder, + Wav2Vec2FeatureProjection, + Wav2Vec2FeedForward, + Wav2Vec2ForAudioFrameClassification, + Wav2Vec2ForCTC, + Wav2Vec2ForPreTraining, + Wav2Vec2ForSequenceClassification, + Wav2Vec2ForXVector, + Wav2Vec2GumbelVectorQuantizer, + Wav2Vec2Model, + Wav2Vec2PositionalConvEmbedding, +) +from .configuration_wav2vec2_conformer import Wav2Vec2ConformerConfig + + +logger = logging.get_logger(__name__) + +_HIDDEN_STATES_START_POSITION = 2 + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`Wav2Vec2ConformerForPreTraining`], with potential hidden states and attentions. + """ +) +class Wav2Vec2ConformerForPreTrainingOutput(ModelOutput): + r""" + loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://huggingface.co/papers/2006.11477). + projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. + codevector_perplexity (`torch.FloatTensor` of shape `(1,)`): + The perplexity of the codevector distribution, used to measure the diversity of the codebook. + contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The contrastive loss (L_m) as stated in the [official paper](https://huggingface.co/papers/2006.11477). + diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The diversity loss (L_d) as stated in the [official paper](https://huggingface.co/papers/2006.11477). + """ + + loss: Optional[torch.FloatTensor] = None + projected_states: Optional[torch.FloatTensor] = None + projected_quantized_states: Optional[torch.FloatTensor] = None + codevector_perplexity: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + contrastive_loss: Optional[torch.FloatTensor] = None + diversity_loss: Optional[torch.FloatTensor] = None + + +class Wav2Vec2ConformerPositionalConvEmbedding(Wav2Vec2PositionalConvEmbedding): + pass + + +class Wav2Vec2ConformerRotaryPositionalEmbedding(nn.Module): + """Rotary positional embedding + Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://huggingface.co/papers/2104.09864 + """ + + def __init__(self, config): + super().__init__() + dim = config.hidden_size // config.num_attention_heads + base = config.rotary_embedding_base + + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)) + self.register_buffer("inv_freq", inv_freq) + self.cached_sequence_length = None + self.cached_rotary_positional_embedding = None + + def forward(self, hidden_states): + sequence_length = hidden_states.shape[1] + + if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None: + return self.cached_rotary_positional_embedding + + self.cached_sequence_length = sequence_length + # Embeddings are computed in the dtype of the inv_freq constant + time_stamps = torch.arange(sequence_length).type_as(self.inv_freq) + freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq) + embeddings = torch.cat((freqs, freqs), dim=-1) + + cos_embeddings = embeddings.cos()[:, None, None, :] + sin_embeddings = embeddings.sin()[:, None, None, :] + # Computed embeddings are cast to the dtype of the hidden state inputs + self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings]).type_as(hidden_states) + return self.cached_rotary_positional_embedding + + +class Wav2Vec2ConformerRelPositionalEmbedding(nn.Module): + """Relative positional encoding module.""" + + def __init__(self, config): + super().__init__() + self.max_len = config.max_source_positions + self.d_model = config.hidden_size + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, self.max_len)) + + def extend_pe(self, x): + # Reset the positional encodings + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(1) >= x.size(1) * 2 - 1: + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` is the position of query vector and `j` is the + # position of key vector. We use positive relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i (batch, 2*channel, dim) + hidden_states = self.pointwise_conv1(hidden_states) + # => (batch, channel, dim) + hidden_states = self.glu(hidden_states) + + # 1D Depthwise Conv + hidden_states = self.depthwise_conv(hidden_states) + hidden_states = self.batch_norm(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = self.pointwise_conv2(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Wav2Vec2ConformerSelfAttention(nn.Module): + """Construct an Wav2Vec2ConformerSelfAttention object. + Can be enhanced with rotary or relative position embeddings. + """ + + def __init__(self, config): + super().__init__() + + self.head_size = config.hidden_size // config.num_attention_heads + self.num_heads = config.num_attention_heads + self.position_embeddings_type = config.position_embeddings_type + + self.linear_q = nn.Linear(config.hidden_size, config.hidden_size) + self.linear_k = nn.Linear(config.hidden_size, config.hidden_size) + self.linear_v = nn.Linear(config.hidden_size, config.hidden_size) + self.linear_out = nn.Linear(config.hidden_size, config.hidden_size) + + self.dropout = nn.Dropout(p=config.attention_dropout) + + if self.position_embeddings_type == "relative": + # linear transformation for positional encoding + self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://huggingface.co/papers/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size)) + self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size)) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + relative_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + # self-attention mechanism + batch_size, sequence_length, hidden_size = hidden_states.size() + + # make sure query/key states can be != value states + query_key_states = hidden_states + value_states = hidden_states + + if self.position_embeddings_type == "rotary": + if relative_position_embeddings is None: + raise ValueError( + "`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'" + ) + query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings) + + # project query_key_states and value_states + query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size) + key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size) + value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size) + + # => (batch, head, time1, d_k) + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + + if self.position_embeddings_type == "relative": + if relative_position_embeddings is None: + raise ValueError( + "`relative_position_embeddings` has to be defined when `self.position_embeddings_type ==" + " 'relative'" + ) + # apply relative_position_embeddings to qk scores + # as proposed in Transformer_XL: https://huggingface.co/papers/1901.02860 + scores = self._apply_relative_embeddings( + query=query, key=key, relative_position_embeddings=relative_position_embeddings + ) + else: + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size) + + # apply attention_mask if necessary + if attention_mask is not None: + scores = scores + attention_mask + + # => (batch, head, time1, time2) + probs = torch.softmax(scores, dim=-1) + probs = self.dropout(probs) + + # => (batch, head, time1, d_k) + hidden_states = torch.matmul(probs, value) + + # => (batch, time1, hidden_size) + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size) + hidden_states = self.linear_out(hidden_states) + + return hidden_states, probs + + def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings): + batch_size, sequence_length, hidden_size = hidden_states.size() + hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size) + + cos = relative_position_embeddings[0, :sequence_length, ...] + sin = relative_position_embeddings[1, :sequence_length, ...] + + # rotate hidden_states with rotary embeddings + hidden_states = hidden_states.transpose(0, 1) + rotated_states_begin = hidden_states[..., : self.head_size // 2] + rotated_states_end = hidden_states[..., self.head_size // 2 :] + rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1) + hidden_states = (hidden_states * cos) + (rotated_states * sin) + hidden_states = hidden_states.transpose(0, 1) + + hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size) + + return hidden_states + + def _apply_relative_embeddings(self, query, key, relative_position_embeddings): + # 1. project positional embeddings + # => (batch, head, 2*time1-1, d_k) + proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings) + proj_relative_position_embeddings = proj_relative_position_embeddings.view( + relative_position_embeddings.size(0), -1, self.num_heads, self.head_size + ) + proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2) + proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3) + + # 2. Add bias to query + # => (batch, head, time1, d_k) + query = query.transpose(1, 2) + q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2) + q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2) + + # 3. attention score: first compute matrix a and matrix c + # as described in https://huggingface.co/papers/1901.02860 Section 3.3 + # => (batch, head, time1, time2) + scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1)) + + # 4. then compute matrix b and matrix d + # => (batch, head, time1, 2*time1-1) + scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings) + + # 5. shift matrix b and matrix d + zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype) + scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1) + scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2]) + scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape) + scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd) + scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1] + + # 6. sum matrices + # => (batch, head, time1, time2) + scores = (scores_ac + scores_bd) / math.sqrt(self.head_size) + + return scores + + +class Wav2Vec2ConformerEncoderLayer(GradientCheckpointingLayer): + """Conformer block based on https://huggingface.co/papers/2005.08100.""" + + def __init__(self, config): + super().__init__() + embed_dim = config.hidden_size + dropout = config.attention_dropout + + # Feed-forward 1 + self.ffn1_layer_norm = nn.LayerNorm(embed_dim) + self.ffn1 = Wav2Vec2ConformerFeedForward(config) + + # Self-Attention + self.self_attn_layer_norm = nn.LayerNorm(embed_dim) + self.self_attn_dropout = nn.Dropout(dropout) + self.self_attn = Wav2Vec2ConformerSelfAttention(config) + + # Conformer Convolution + self.conv_module = Wav2Vec2ConformerConvolutionModule(config) + + # Feed-forward 2 + self.ffn2_layer_norm = nn.LayerNorm(embed_dim) + self.ffn2 = Wav2Vec2ConformerFeedForward(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + def forward( + self, + hidden_states, + attention_mask: Optional[torch.Tensor] = None, + relative_position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + # 1. Feed-Forward 1 layer + residual = hidden_states + hidden_states = self.ffn1_layer_norm(hidden_states) + hidden_states = self.ffn1(hidden_states) + hidden_states = hidden_states * 0.5 + residual + residual = hidden_states + + # 2. Self-Attention layer + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weigts = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + relative_position_embeddings=relative_position_embeddings, + output_attentions=output_attentions, + ) + hidden_states = self.self_attn_dropout(hidden_states) + hidden_states = hidden_states + residual + + # 3. Convolutional Layer + residual = hidden_states + hidden_states = self.conv_module(hidden_states) + hidden_states = residual + hidden_states + + # 4. Feed-Forward 2 Layer + residual = hidden_states + hidden_states = self.ffn2_layer_norm(hidden_states) + hidden_states = self.ffn2(hidden_states) + hidden_states = hidden_states * 0.5 + residual + hidden_states = self.final_layer_norm(hidden_states) + + return hidden_states, attn_weigts + + +class Wav2Vec2ConformerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + if config.position_embeddings_type == "relative": + self.embed_positions = Wav2Vec2ConformerRelPositionalEmbedding(config) + elif config.position_embeddings_type == "rotary": + self.embed_positions = Wav2Vec2ConformerRotaryPositionalEmbedding(config) + else: + self.embed_positions = None + + self.pos_conv_embed = Wav2Vec2ConformerPositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList([Wav2Vec2ConformerEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0.0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + hidden_states = self.dropout(hidden_states) + + if self.embed_positions is not None: + relative_position_embeddings = self.embed_positions(hidden_states) + else: + relative_position_embeddings = None + + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = self.training and dropout_probability < self.config.layerdrop + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync + layer_outputs = layer( + hidden_states, + attention_mask=attention_mask, + relative_position_embeddings=relative_position_embeddings, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2ConformerGumbelVectorQuantizer(Wav2Vec2GumbelVectorQuantizer): + pass + + +class Wav2Vec2ConformerAdapter(Wav2Vec2Adapter): + pass + + +class Wav2Vec2ConformerAdapterLayer(Wav2Vec2AdapterLayer): + pass + + +@auto_docstring +class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel): + config: Wav2Vec2ConformerConfig + base_model_prefix = "wav2vec2_conformer" + main_input_name = "input_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init. + if isinstance(module, Wav2Vec2ConformerForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_hf_initialized = True + module.project_q._is_hf_initialized = True + # gumbel softmax requires special init + elif isinstance(module, Wav2Vec2ConformerGumbelVectorQuantizer): + module.weight_proj.weight.data.normal_(mean=0.0, std=1) + module.weight_proj.bias.data.zero_() + nn.init.uniform_(module.codevectors) + elif isinstance(module, Wav2Vec2ConformerSelfAttention): + if hasattr(module, "pos_bias_u"): + nn.init.xavier_uniform_(module.pos_bias_u) + if hasattr(module, "pos_bias_v"): + nn.init.xavier_uniform_(module.pos_bias_v) + elif isinstance(module, Wav2Vec2ConformerPositionalConvEmbedding): + nn.init.normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)), + ) + nn.init.constant_(module.conv.bias, 0) + elif isinstance(module, Wav2Vec2ConformerFeatureProjection): + k = math.sqrt(1 / module.projection.in_features) + nn.init.uniform_(module.projection.weight, a=-k, b=k) + nn.init.uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + + def _get_feat_extract_output_lengths( + self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None + ): + """ + Computes the output length of the convolutional layers + """ + + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + if add_adapter: + for _ in range(self.config.num_adapter_layers): + input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) + + return input_lengths + + def _get_feature_vector_attention_mask( + self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None + ): + # Effectively attention_mask.sum(-1), but not inplace to be able to run + # on inference mode. + non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1] + + output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) + output_lengths = output_lengths.to(torch.long) + + batch_size = attention_mask.shape[0] + + attention_mask = torch.zeros( + (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device + ) + # these two operations makes sure that all values before the output lengths idxs are attended to + attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + return attention_mask + + +WAV2VEC2_CONFORMER_START_DOCSTRING = None # will be automatically redefined + + +Wav2Vec2ConformerBaseModelOutput = Wav2Vec2BaseModelOutput + + +class Wav2Vec2ConformerModel(Wav2Vec2ConformerPreTrainedModel, Wav2Vec2Model): + def __init__(self, config: Wav2Vec2ConformerConfig): + Wav2Vec2ConformerPreTrainedModel.__init__(self, config) + self.config = config + self.feature_extractor = Wav2Vec2ConformerFeatureEncoder(config) + self.feature_projection = Wav2Vec2ConformerFeatureProjection(config) + + # model only needs masking vector if mask prob is > 0.0 + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_()) + + self.encoder = Wav2Vec2ConformerEncoder(config) + + self.adapter = Wav2Vec2ConformerAdapter(config) if config.add_adapter else None + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_extractor(self): + raise AttributeError("Not needed for Wav2Vec2Conformer") + + +class Wav2Vec2ConformerForPreTraining(Wav2Vec2ForPreTraining): + def __init__(self, config: Wav2Vec2ConformerConfig): + super().__init__(config) + + def freeze_feature_extractor(self): + raise AttributeError("Not needed for Wav2Vec2Conformer") + + +class Wav2Vec2ConformerForCTC(Wav2Vec2ForCTC): + def __init__(self, config, target_lang: Optional[str] = None): + r""" + target_lang (`str`, *optional*): + Language id of adapter weights. Adapter weights are stored in the format adapter..safetensors or + adapter..bin. Only relevant when using an instance of [`UniSpeechSatForCTC`] with adapters. Uses 'eng' by + default. + """ + super().__init__(config) + + def tie_weights(self): + raise AttributeError("Not needed for Wav2Vec2Conformer") + + def freeze_feature_extractor(self): + raise AttributeError("Not needed for Wav2Vec2Conformer") + + def freeze_base_model(self): + raise AttributeError("Not needed for Wav2Vec2Conformer") + + +class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ForSequenceClassification): + def __init__(self, config): + super().__init__(config) + + def freeze_feature_extractor(self): + raise AttributeError("Not needed for Wav2Vec2Conformer") + + +class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ForAudioFrameClassification): + def __init__(self, config): + super().__init__(config) + + def freeze_feature_extractor(self): + raise AttributeError("Not needed for Wav2Vec2Conformer") + + +class Wav2Vec2ConformerForXVector(Wav2Vec2ForXVector): + def __init__(self, config): + super().__init__(config) + + def freeze_feature_extractor(self): + raise AttributeError("Not needed for Wav2Vec2Conformer") + + +__all__ = [ + "Wav2Vec2ConformerForAudioFrameClassification", + "Wav2Vec2ConformerForCTC", + "Wav2Vec2ConformerForPreTraining", + "Wav2Vec2ConformerForSequenceClassification", + "Wav2Vec2ConformerForXVector", + "Wav2Vec2ConformerModel", + "Wav2Vec2ConformerPreTrainedModel", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_with_lm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_with_lm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..95fd10dab7092fb8571a332b1ecf2e9a334d8fc4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_with_lm/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .processing_wav2vec2_with_lm import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..beb22ca86749f01cde572cad7fe0b19e01f1e415 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py @@ -0,0 +1,659 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Speech processor class for Wav2Vec2 +""" + +import os +import warnings +from collections.abc import Iterable +from contextlib import contextmanager, nullcontext +from dataclasses import dataclass +from multiprocessing import Pool, get_context, get_start_method +from typing import TYPE_CHECKING, Optional, Union + +import numpy as np + +from ...processing_utils import ProcessorMixin +from ...utils import ModelOutput, logging, requires_backends + + +logger = logging.get_logger(__name__) + + +if TYPE_CHECKING: + from pyctcdecode import BeamSearchDecoderCTC + + from ...feature_extraction_utils import FeatureExtractionMixin + from ...tokenization_utils import PreTrainedTokenizerBase + + +ListOfDict = list[dict[str, Union[int, str]]] + + +@dataclass +class Wav2Vec2DecoderWithLMOutput(ModelOutput): + """ + Output type of [`Wav2Vec2DecoderWithLM`], with transcription. + + Args: + text (list of `str` or `str`): + Decoded logits in text from. Usually the speech transcription. + logit_score (list of `float` or `float`): + Total logit score of the beams associated with produced text. + lm_score (list of `float`): + Fused lm_score of the beams associated with produced text. + word_offsets (list of `list[dict[str, Union[int, str]]]` or `list[dict[str, Union[int, str]]]`): + Offsets of the decoded words. In combination with sampling rate and model downsampling rate word offsets + can be used to compute time stamps for each word. + """ + + text: Union[list[list[str]], list[str], str] + logit_score: Union[list[list[float]], list[float], float] = None + lm_score: Union[list[list[float]], list[float], float] = None + word_offsets: Union[list[list[ListOfDict]], list[ListOfDict], ListOfDict] = None + + +class Wav2Vec2ProcessorWithLM(ProcessorMixin): + r""" + Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor, a Wav2Vec2 CTC tokenizer and a decoder + with language model support into a single processor for language model boosted speech recognition decoding. + + Args: + feature_extractor ([`Wav2Vec2FeatureExtractor`] or [`SeamlessM4TFeatureExtractor`]): + An instance of [`Wav2Vec2FeatureExtractor`] or [`SeamlessM4TFeatureExtractor`]. The feature extractor is a required input. + tokenizer ([`Wav2Vec2CTCTokenizer`]): + An instance of [`Wav2Vec2CTCTokenizer`]. The tokenizer is a required input. + decoder (`pyctcdecode.BeamSearchDecoderCTC`): + An instance of [`pyctcdecode.BeamSearchDecoderCTC`]. The decoder is a required input. + """ + + feature_extractor_class = "AutoFeatureExtractor" + tokenizer_class = "Wav2Vec2CTCTokenizer" + + def __init__( + self, + feature_extractor: "FeatureExtractionMixin", + tokenizer: "PreTrainedTokenizerBase", + decoder: "BeamSearchDecoderCTC", + ): + from pyctcdecode import BeamSearchDecoderCTC + + super().__init__(feature_extractor, tokenizer) + if not isinstance(decoder, BeamSearchDecoderCTC): + raise TypeError(f"`decoder` has to be of type {BeamSearchDecoderCTC.__class__}, but is {type(decoder)}") + + if feature_extractor.__class__.__name__ not in ["Wav2Vec2FeatureExtractor", "SeamlessM4TFeatureExtractor"]: + raise ValueError( + f"`feature_extractor` has to be of type `Wav2Vec2FeatureExtractor` or `SeamlessM4TFeatureExtractor`, but is {type(feature_extractor)}" + ) + + # make sure that decoder's alphabet and tokenizer's vocab match in content + missing_decoder_tokens = self.get_missing_alphabet_tokens(decoder, tokenizer) + if len(missing_decoder_tokens) > 0: + raise ValueError( + f"The tokens {missing_decoder_tokens} are defined in the tokenizer's " + "vocabulary, but not in the decoder's alphabet. " + f"Make sure to include {missing_decoder_tokens} in the decoder's alphabet." + ) + + self.decoder = decoder + self.current_processor = self.feature_extractor + self._in_target_context_manager = False + + def save_pretrained(self, save_directory): + super().save_pretrained(save_directory) + self.decoder.save_to_dir(save_directory) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + Instantiate a [`Wav2Vec2ProcessorWithLM`] from a pretrained Wav2Vec2 processor. + + + + This class method is simply calling the feature extractor's + [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], Wav2Vec2CTCTokenizer's + [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], and + [`pyctcdecode.BeamSearchDecoderCTC.load_from_hf_hub`]. + + Please refer to the docstrings of the methods above for more information. + + + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on + huggingface.co. + - a path to a *directory* containing a feature extractor file saved using the + [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. + - a path or url to a saved feature extractor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. + **kwargs + Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and + [`PreTrainedTokenizer`] + """ + requires_backends(cls, "pyctcdecode") + from pyctcdecode import BeamSearchDecoderCTC + + feature_extractor, tokenizer = super()._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) + + if os.path.isdir(pretrained_model_name_or_path) or os.path.isfile(pretrained_model_name_or_path): + unigram_encoding = kwargs.get("unigram_encoding", "utf-8") + decoder = BeamSearchDecoderCTC.load_from_dir(pretrained_model_name_or_path, unigram_encoding) + else: + # BeamSearchDecoderCTC has no auto class + kwargs.pop("_from_auto", None) + # snapshot_download has no `trust_remote_code` flag + kwargs.pop("trust_remote_code", None) + + # make sure that only relevant filenames are downloaded + language_model_filenames = os.path.join(BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*") + alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME + allow_patterns = [language_model_filenames, alphabet_filename] + + decoder = BeamSearchDecoderCTC.load_from_hf_hub( + pretrained_model_name_or_path, allow_patterns=allow_patterns, **kwargs + ) + + # set language model attributes + for attribute in ["alpha", "beta", "unk_score_offset", "score_boundary"]: + value = kwargs.pop(attribute, None) + + if value is not None: + cls._set_language_model_attribute(decoder, attribute, value) + + # make sure that decoder's alphabet and tokenizer's vocab match in content + missing_decoder_tokens = cls.get_missing_alphabet_tokens(decoder, tokenizer) + if len(missing_decoder_tokens) > 0: + raise ValueError( + f"The tokens {missing_decoder_tokens} are defined in the tokenizer's " + "vocabulary, but not in the decoder's alphabet. " + f"Make sure to include {missing_decoder_tokens} in the decoder's alphabet." + ) + + return cls(feature_extractor=feature_extractor, tokenizer=tokenizer, decoder=decoder) + + @staticmethod + def _set_language_model_attribute(decoder: "BeamSearchDecoderCTC", attribute: str, value: float): + setattr(decoder.model_container[decoder._model_key], attribute, value) + + @property + def language_model(self): + return self.decoder.model_container[self.decoder._model_key] + + @staticmethod + def get_missing_alphabet_tokens(decoder, tokenizer): + from pyctcdecode.alphabet import BLANK_TOKEN_PTN, UNK_TOKEN, UNK_TOKEN_PTN + + # we need to make sure that all of the tokenizer's except the special tokens + # are present in the decoder's alphabet. Retrieve missing alphabet token + # from decoder + tokenizer_vocab_list = list(tokenizer.get_vocab().keys()) + + # replace special tokens + for i, token in enumerate(tokenizer_vocab_list): + if BLANK_TOKEN_PTN.match(token): + tokenizer_vocab_list[i] = "" + if token == tokenizer.word_delimiter_token: + tokenizer_vocab_list[i] = " " + if UNK_TOKEN_PTN.match(token): + tokenizer_vocab_list[i] = UNK_TOKEN + + # are any of the extra tokens no special tokenizer tokens? + missing_tokens = set(tokenizer_vocab_list) - set(decoder._alphabet.labels) + + return missing_tokens + + def __call__(self, *args, **kwargs): + """ + When used in normal mode, this method forwards all its arguments to the feature extractor's + [`~FeatureExtractionMixin.__call__`] and returns its output. If used in the context + [`~Wav2Vec2ProcessorWithLM.as_target_processor`] this method forwards all its arguments to + Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.__call__`]. Please refer to the docstring of the above two + methods for more information. + """ + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor(*args, **kwargs) + + if "raw_speech" in kwargs: + warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.") + audio = kwargs.pop("raw_speech") + else: + audio = kwargs.pop("audio", None) + sampling_rate = kwargs.pop("sampling_rate", None) + text = kwargs.pop("text", None) + if len(args) > 0: + audio = args[0] + args = args[1:] + + if audio is None and text is None: + raise ValueError("You need to specify either an `audio` or `text` input to process.") + + if audio is not None: + inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs) + if text is not None: + encodings = self.tokenizer(text, **kwargs) + + if text is None: + return inputs + elif audio is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] + return inputs + + def pad(self, *args, **kwargs): + """ + When used in normal mode, this method forwards all its arguments to the feature extractor's + [`~FeatureExtractionMixin.pad`] and returns its output. If used in the context + [`~Wav2Vec2ProcessorWithLM.as_target_processor`] this method forwards all its arguments to + Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.pad`]. Please refer to the docstring of the above two methods + for more information. + """ + # For backward compatibility + if self._in_target_context_manager: + return self.current_processor.pad(*args, **kwargs) + + input_features = kwargs.pop("input_features", None) + labels = kwargs.pop("labels", None) + if len(args) > 0: + input_features = args[0] + args = args[1:] + + if input_features is not None: + input_features = self.feature_extractor.pad(input_features, *args, **kwargs) + if labels is not None: + labels = self.tokenizer.pad(labels, **kwargs) + + if labels is None: + return input_features + elif input_features is None: + return labels + else: + input_features["labels"] = labels["input_ids"] + return input_features + + def batch_decode( + self, + logits: np.ndarray, + pool: Optional[Pool] = None, + num_processes: Optional[int] = None, + beam_width: Optional[int] = None, + beam_prune_logp: Optional[float] = None, + token_min_logp: Optional[float] = None, + hotwords: Optional[Iterable[str]] = None, + hotword_weight: Optional[float] = None, + alpha: Optional[float] = None, + beta: Optional[float] = None, + unk_score_offset: Optional[float] = None, + lm_score_boundary: Optional[bool] = None, + output_word_offsets: bool = False, + n_best: int = 1, + ): + """ + Batch decode output logits to audio transcription with language model support. + + + + This function makes use of Python's multiprocessing. Currently, multiprocessing is available only on Unix + systems (see this [issue](https://github.com/kensho-technologies/pyctcdecode/issues/65)). + + If you are decoding multiple batches, consider creating a `Pool` and passing it to `batch_decode`. Otherwise, + `batch_decode` will be very slow since it will create a fresh `Pool` for each call. See usage example below. + + + + Args: + logits (`np.ndarray`): + The logits output vector of the model representing the log probabilities for each token. + pool (`multiprocessing.Pool`, *optional*): + An optional user-managed pool. If not set, one will be automatically created and closed. The pool + should be instantiated *after* `Wav2Vec2ProcessorWithLM`. Otherwise, the LM won't be available to the + pool's sub-processes. + + + + Currently, only pools created with a 'fork' context can be used. If a 'spawn' pool is passed, it will + be ignored and sequential decoding will be used instead. + + + + num_processes (`int`, *optional*): + If `pool` is not set, number of processes on which the function should be parallelized over. Defaults + to the number of available CPUs. + beam_width (`int`, *optional*): + Maximum number of beams at each step in decoding. Defaults to pyctcdecode's DEFAULT_BEAM_WIDTH. + beam_prune_logp (`int`, *optional*): + Beams that are much worse than best beam will be pruned Defaults to pyctcdecode's DEFAULT_PRUNE_LOGP. + token_min_logp (`int`, *optional*): + Tokens below this logp are skipped unless they are argmax of frame Defaults to pyctcdecode's + DEFAULT_MIN_TOKEN_LOGP. + hotwords (`list[str]`, *optional*): + List of words with extra importance, can be OOV for LM + hotword_weight (`int`, *optional*): + Weight factor for hotword importance Defaults to pyctcdecode's DEFAULT_HOTWORD_WEIGHT. + alpha (`float`, *optional*): + Weight for language model during shallow fusion + beta (`float`, *optional*): + Weight for length score adjustment of during scoring + unk_score_offset (`float`, *optional*): + Amount of log score offset for unknown tokens + lm_score_boundary (`bool`, *optional*): + Whether to have kenlm respect boundaries when scoring + output_word_offsets (`bool`, *optional*, defaults to `False`): + Whether or not to output word offsets. Word offsets can be used in combination with the sampling rate + and model downsampling rate to compute the time-stamps of transcribed words. + n_best (`int`, *optional*, defaults to `1`): + Number of best hypotheses to return. If `n_best` is greater than 1, the returned `text` will be a list + of lists of strings, `logit_score` will be a list of lists of floats, and `lm_score` will be a list of + lists of floats, where the length of the outer list will correspond to the batch size and the length of + the inner list will correspond to the number of returned hypotheses . The value should be >= 1. + + + + Please take a look at the Example of [`~Wav2Vec2ProcessorWithLM.decode`] to better understand how to + make use of `output_word_offsets`. [`~Wav2Vec2ProcessorWithLM.batch_decode`] works the same way with + batched output. + + + + Returns: + [`~models.wav2vec2.Wav2Vec2DecoderWithLMOutput`]. + + Example: + See [Decoding multiple audios](#decoding-multiple-audios). + """ + + from pyctcdecode.constants import ( + DEFAULT_BEAM_WIDTH, + DEFAULT_HOTWORD_WEIGHT, + DEFAULT_MIN_TOKEN_LOGP, + DEFAULT_PRUNE_LOGP, + ) + + # set defaults + beam_width = beam_width if beam_width is not None else DEFAULT_BEAM_WIDTH + beam_prune_logp = beam_prune_logp if beam_prune_logp is not None else DEFAULT_PRUNE_LOGP + token_min_logp = token_min_logp if token_min_logp is not None else DEFAULT_MIN_TOKEN_LOGP + hotword_weight = hotword_weight if hotword_weight is not None else DEFAULT_HOTWORD_WEIGHT + + # reset params at every forward call. It's just a `set` method in pyctcdecode + self.decoder.reset_params( + alpha=alpha, beta=beta, unk_score_offset=unk_score_offset, lm_score_boundary=lm_score_boundary + ) + + # create multiprocessing pool and list numpy arrays + # filter out logits padding + logits_list = [array[(array != -100.0).all(axis=-1)] for array in logits] + + # create a pool if necessary while also using it as a context manager to close itself + if pool is None: + # fork is safe to use only on Unix, see "Contexts and start methods" section on + # multiprocessing's docs (https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) + default_context = get_start_method() + + if default_context == "fork": + cm = pool = get_context().Pool(num_processes) + else: + logger.warning( + "Parallel batch decoding is not currently supported in this platform. " + "Falling back to sequential decoding." + ) + cm = nullcontext() + else: + # pool is managed by the user, so we don't need to close it + cm = nullcontext() + + if num_processes is not None: + logger.warning( + "Parameter `num_process` was passed, but it will be ignored since `pool` was also specified." + ) + + # pyctcdecode + with cm: + decoded_beams = self.decoder.decode_beams_batch( + pool=pool, + logits_list=logits_list, + beam_width=beam_width, + beam_prune_logp=beam_prune_logp, + token_min_logp=token_min_logp, + hotwords=hotwords, + hotword_weight=hotword_weight, + ) + + # extract text and scores + batch_texts, logit_scores, lm_scores, word_offsets = [], [], [], [] + + for d in decoded_beams: + batch_texts.append([beam[0] for beam in d]) + logit_scores.append([beam[-2] for beam in d]) + lm_scores.append([beam[-1] for beam in d]) + + # word_offsets.append([{"word": t[0], "start_offset": t[1][0], "end_offset": t[1][1]} for t in d[0][1]]) + + word_offsets.append( + [ + [ + {"word": word, "start_offset": start_offset, "end_offset": end_offset} + for word, (start_offset, end_offset) in beam[1] + ] + for beam in d + ] + ) + + word_offsets = word_offsets if output_word_offsets else None + + if n_best == 1: + return Wav2Vec2DecoderWithLMOutput( + text=[hyps[0] for hyps in batch_texts], + logit_score=[hyps[0] for hyps in logit_scores], + lm_score=[hyps[0] for hyps in lm_scores], + word_offsets=[hyps[0] for hyps in word_offsets] if word_offsets is not None else None, + ) + else: + return Wav2Vec2DecoderWithLMOutput( + text=[hyps[:n_best] for hyps in batch_texts], + logit_score=[hyps[:n_best] for hyps in logit_scores], + lm_score=[hyps[:n_best] for hyps in lm_scores], + word_offsets=[hyps[:n_best] for hyps in word_offsets] if word_offsets is not None else None, + ) + + def decode( + self, + logits: np.ndarray, + beam_width: Optional[int] = None, + beam_prune_logp: Optional[float] = None, + token_min_logp: Optional[float] = None, + hotwords: Optional[Iterable[str]] = None, + hotword_weight: Optional[float] = None, + alpha: Optional[float] = None, + beta: Optional[float] = None, + unk_score_offset: Optional[float] = None, + lm_score_boundary: Optional[bool] = None, + output_word_offsets: bool = False, + n_best: int = 1, + ): + """ + Decode output logits to audio transcription with language model support. + + Args: + logits (`np.ndarray`): + The logits output vector of the model representing the log probabilities for each token. + beam_width (`int`, *optional*): + Maximum number of beams at each step in decoding. Defaults to pyctcdecode's DEFAULT_BEAM_WIDTH. + beam_prune_logp (`int`, *optional*): + A threshold to prune beams with log-probs less than best_beam_logp + beam_prune_logp. The value should + be <= 0. Defaults to pyctcdecode's DEFAULT_PRUNE_LOGP. + token_min_logp (`int`, *optional*): + Tokens with log-probs below token_min_logp are skipped unless they are have the maximum log-prob for an + utterance. Defaults to pyctcdecode's DEFAULT_MIN_TOKEN_LOGP. + hotwords (`list[str]`, *optional*): + List of words with extra importance which can be missing from the LM's vocabulary, e.g. ["huggingface"] + hotword_weight (`int`, *optional*): + Weight multiplier that boosts hotword scores. Defaults to pyctcdecode's DEFAULT_HOTWORD_WEIGHT. + alpha (`float`, *optional*): + Weight for language model during shallow fusion + beta (`float`, *optional*): + Weight for length score adjustment of during scoring + unk_score_offset (`float`, *optional*): + Amount of log score offset for unknown tokens + lm_score_boundary (`bool`, *optional*): + Whether to have kenlm respect boundaries when scoring + output_word_offsets (`bool`, *optional*, defaults to `False`): + Whether or not to output word offsets. Word offsets can be used in combination with the sampling rate + and model downsampling rate to compute the time-stamps of transcribed words. + n_best (`int`, *optional*, defaults to `1`): + Number of best hypotheses to return. If `n_best` is greater than 1, the returned `text` will be a list + of strings, `logit_score` will be a list of floats, and `lm_score` will be a list of floats, where the + length of these lists will correspond to the number of returned hypotheses. The value should be >= 1. + + + + Please take a look at the example below to better understand how to make use of `output_word_offsets`. + + + + Returns: + [`~models.wav2vec2.Wav2Vec2DecoderWithLMOutput`]. + + Example: + + ```python + >>> # Let's see how to retrieve time steps for a model + >>> from transformers import AutoTokenizer, AutoProcessor, AutoModelForCTC + >>> from datasets import load_dataset + >>> import datasets + >>> import torch + + >>> # import model, feature extractor, tokenizer + >>> model = AutoModelForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm") + >>> processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm") + + >>> # load first sample of English common_voice + >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True) + >>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000)) + >>> dataset_iter = iter(dataset) + >>> sample = next(dataset_iter) + + >>> # forward sample through model to get greedily predicted transcription ids + >>> input_values = processor(sample["audio"]["array"], return_tensors="pt").input_values + >>> with torch.no_grad(): + ... logits = model(input_values).logits[0].cpu().numpy() + + >>> # retrieve word stamps (analogous commands for `output_char_offsets`) + >>> outputs = processor.decode(logits, output_word_offsets=True) + >>> # compute `time_offset` in seconds as product of downsampling ratio and sampling_rate + >>> time_offset = model.config.inputs_to_logits_ratio / processor.feature_extractor.sampling_rate + + >>> word_offsets = [ + ... { + ... "word": d["word"], + ... "start_time": round(d["start_offset"] * time_offset, 2), + ... "end_time": round(d["end_offset"] * time_offset, 2), + ... } + ... for d in outputs.word_offsets + ... ] + >>> # compare word offsets with audio `en_train_0/common_voice_en_19121553.mp3` online on the dataset viewer: + >>> # https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/viewer/en + >>> word_offsets[:4] + [{'word': 'THE', 'start_time': 0.68, 'end_time': 0.78}, {'word': 'TRACK', 'start_time': 0.88, 'end_time': 1.1}, {'word': 'APPEARS', 'start_time': 1.18, 'end_time': 1.66}, {'word': 'ON', 'start_time': 1.86, 'end_time': 1.92}] + ```""" + + from pyctcdecode.constants import ( + DEFAULT_BEAM_WIDTH, + DEFAULT_HOTWORD_WEIGHT, + DEFAULT_MIN_TOKEN_LOGP, + DEFAULT_PRUNE_LOGP, + ) + + # set defaults + beam_width = beam_width if beam_width is not None else DEFAULT_BEAM_WIDTH + beam_prune_logp = beam_prune_logp if beam_prune_logp is not None else DEFAULT_PRUNE_LOGP + token_min_logp = token_min_logp if token_min_logp is not None else DEFAULT_MIN_TOKEN_LOGP + hotword_weight = hotword_weight if hotword_weight is not None else DEFAULT_HOTWORD_WEIGHT + + # reset params at every forward call. It's just a `set` method in pyctcdecode + self.decoder.reset_params( + alpha=alpha, beta=beta, unk_score_offset=unk_score_offset, lm_score_boundary=lm_score_boundary + ) + + # pyctcdecode + decoded_beams = self.decoder.decode_beams( + logits, + beam_width=beam_width, + beam_prune_logp=beam_prune_logp, + token_min_logp=token_min_logp, + hotwords=hotwords, + hotword_weight=hotword_weight, + ) + + word_offsets = None + if output_word_offsets: + word_offsets = [ + [ + {"word": word, "start_offset": start_offset, "end_offset": end_offset} + for word, (start_offset, end_offset) in beam[2] + ] + for beam in decoded_beams + ] + logit_scores = [beam[-2] for beam in decoded_beams] + + lm_scores = [beam[-1] for beam in decoded_beams] + + hypotheses = [beam[0] for beam in decoded_beams] + + if n_best > len(decoded_beams): + logger.info( + "N-best size is larger than the number of generated hypotheses, all hypotheses will be returned." + ) + + if n_best == 1: + return Wav2Vec2DecoderWithLMOutput( + text=hypotheses[0], + logit_score=logit_scores[0], + lm_score=lm_scores[0], + word_offsets=word_offsets[0] if word_offsets is not None else None, + ) + else: + return Wav2Vec2DecoderWithLMOutput( + text=hypotheses[:n_best], + logit_score=logit_scores[:n_best], + lm_score=lm_scores[:n_best], + word_offsets=word_offsets[:n_best] if word_offsets is not None else None, + ) + + @contextmanager + def as_target_processor(self): + """ + Temporarily sets the processor for processing the target. Useful for encoding the labels when fine-tuning + Wav2Vec2. + """ + warnings.warn( + "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your " + "labels by using the argument `text` of the regular `__call__` method (either in the same call as " + "your audio inputs, or in a separate call." + ) + self._in_target_context_manager = True + self.current_processor = self.tokenizer + yield + self.current_processor = self.feature_extractor + self._in_target_context_manager = False + + +__all__ = ["Wav2Vec2ProcessorWithLM"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d230bf3f0924b436e1b9cd9ea509a989cf5bdb8b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_wavlm import * + from .modeling_wavlm import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd241e4524f2e28ef39d5d2bc7c940b10bf879ba Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/configuration_wavlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/configuration_wavlm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4141f877c6571c7b0263064c97e9dc31ce1072bb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/configuration_wavlm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/modeling_wavlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/modeling_wavlm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f02b9b878618494636cd0bd68e1b855af9ba2d9b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/modeling_wavlm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/modular_wavlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/modular_wavlm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e062c1b82560126b7b05fe4830757cf1bc700e0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/modular_wavlm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/configuration_wavlm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/configuration_wavlm.py new file mode 100644 index 0000000000000000000000000000000000000000..a137792e2a8da48684738ea396e6704dc849613a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/configuration_wavlm.py @@ -0,0 +1,337 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors, Microsoft Research, and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""WavLM model configuration""" + +import functools +import operator + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class WavLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`WavLMModel`]. It is used to instantiate an WavLM + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the WavLM + [microsoft/wavlm-base](https://huggingface.co/microsoft/wavlm-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32): + Vocabulary size of the WavLM model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`WavLMModel`]. Vocabulary size of the model. Defines the different tokens + that can be represented by the *inputs_ids* passed to the forward method of [`WavLMModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + activation_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for activations inside the fully connected layer. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + final_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the final projection layer of [`WavLMForCTC`]. + layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) for more + details. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + feat_extract_norm (`str`, *optional*, defaults to `"group"`): + The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group + normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D + convolutional layers. + feat_proj_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for output of the feature encoder. + feat_extract_activation (`str, `optional`, defaults to `"gelu"`): + The non-linear activation function (function or string) in the 1D convolutional layers of the feature + extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + conv_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): + A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. + conv_stride (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length + of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. + conv_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The + length of *conv_kernel* defines the number of convolutional layers and has to match the length of + *conv_dim*. + conv_bias (`bool`, *optional*, defaults to `False`): + Whether the 1D convolutional layers have a bias. + num_conv_pos_embeddings (`int`, *optional*, defaults to 128): + Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional + embeddings layer. + num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): + Number of groups of 1D convolutional positional embeddings layer. + do_stable_layer_norm (`bool`, *optional*, defaults to `False`): + Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is + True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is + False` corresponds to applying layer norm after the attention layer. + apply_spec_augment (`bool`, *optional*, defaults to `True`): + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see + [SpecAugment: A Simple Data Augmentation Method for Automatic Speech + Recognition](https://huggingface.co/papers/1904.08779). + mask_time_prob (`float`, *optional*, defaults to 0.05): + Probability of each feature vector along the time axis to be chosen as the start of the vector span to be + masked. Approximately `mask_time_prob * sequence_length // mask_time_length` feature vectors will be masked + along the time axis. This is only relevant if `apply_spec_augment is True`. + mask_time_length (`int`, *optional*, defaults to 10): + Length of vector span along the time axis. + mask_time_min_masks (`int`, *optional*, defaults to 2),: + The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, + irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < + mask_time_min_masks'' + mask_feature_prob (`float`, *optional*, defaults to 0.0): + Probability of each feature vector along the feature axis to be chosen as the start of the vector span to + be masked. Approximately `mask_time_prob * hidden_size // mask_time_length` feature vectors will be masked + along the time axis. This is only relevant if `apply_spec_augment is True`. + mask_feature_length (`int`, *optional*, defaults to 10): + Length of vector span along the feature axis. + num_codevectors_per_group (`int`, *optional*, defaults to 320): + Number of entries in each quantization codebook (group). + num_codevector_groups (`int`, *optional*, defaults to 2): + Number of codevector groups for product codevector quantization. + contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): + The temperature *kappa* in the contrastive loss. + num_negatives (`int`, *optional*, defaults to 100): + Number of negative samples for the contrastive loss. + codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the quantized feature vectors. + proj_codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the final projection of both the quantized and the transformer features. + diversity_loss_weight (`int`, *optional*, defaults to 0.1): + The weight of the codebook diversity loss component. + ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`): + Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an + instance of [`WavLMForCTC`]. + ctc_zero_infinity (`bool`, *optional*, defaults to `False`): + Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly + occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance + of [`WavLMForCTC`]. + use_weighted_layer_sum (`bool`, *optional*, defaults to `False`): + Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an + instance of [`WavLMForSequenceClassification`]. + classifier_proj_size (`int`, *optional*, defaults to 256): + Dimensionality of the projection before token mean-pooling for classification. + tdnn_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`): + A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN* + module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers. + tdnn_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the + *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*. + tdnn_dilation (`tuple[int]` or `list[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`): + A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the + *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*. + xvector_output_dim (`int`, *optional*, defaults to 512): + Dimensionality of the *XVector* embedding vectors. + add_adapter (`bool`, *optional*, defaults to `False`): + Whether a convolutional network should be stacked on top of the Wav2Vec2 Encoder. Can be very useful for + warm-starting Wav2Vec2 for SpeechEncoderDecoder models. + adapter_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adapter_stride (`int`, *optional*, defaults to 2): + Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + num_adapter_layers (`int`, *optional*, defaults to 3): + Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is + True`. + output_hidden_size (`int`, *optional*): + Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant + if `add_adapter is True`. + + Example: + + ```python + + ``` + + Example: + + ```python + >>> from transformers import WavLMConfig, WavLMModel + + >>> # Initializing a WavLM facebook/wavlm-base-960h style configuration + >>> configuration = WavLMConfig() + + >>> # Initializing a model (with random weights) from the facebook/wavlm-base-960h style configuration + >>> model = WavLMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "wavlm" + + def __init__( + self, + vocab_size=32, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout=0.1, + activation_dropout=0.1, + attention_dropout=0.1, + feat_proj_dropout=0.0, + final_dropout=0.1, + layerdrop=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + feat_extract_norm="group", + feat_extract_activation="gelu", + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=False, + num_conv_pos_embeddings=128, + num_conv_pos_embedding_groups=16, + num_buckets=320, + max_bucket_distance=800, + do_stable_layer_norm=False, + apply_spec_augment=True, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + num_codevectors_per_group=320, + num_codevector_groups=2, + contrastive_logits_temperature=0.1, + num_negatives=100, + codevector_dim=256, + proj_codevector_dim=256, + diversity_loss_weight=0.1, + ctc_loss_reduction="mean", + ctc_zero_infinity=False, + use_weighted_layer_sum=False, + classifier_proj_size=256, + tdnn_dim=(512, 512, 512, 512, 1500), + tdnn_kernel=(5, 3, 3, 1, 1), + tdnn_dilation=(1, 2, 3, 1, 1), + xvector_output_dim=512, + num_ctc_classes=80, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + add_adapter=False, + adapter_kernel_size=3, + adapter_stride=2, + num_adapter_layers=3, + output_hidden_size=None, + **kwargs, + ): + super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_activation = feat_extract_activation + self.conv_dim = list(conv_dim) + self.conv_stride = list(conv_stride) + self.conv_kernel = list(conv_kernel) + self.conv_bias = conv_bias + self.num_buckets = num_buckets + self.max_bucket_distance = max_bucket_distance + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_feat_extract_layers = len(self.conv_dim) + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.num_ctc_classes = num_ctc_classes + self.vocab_size = vocab_size + self.do_stable_layer_norm = do_stable_layer_norm + self.use_weighted_layer_sum = use_weighted_layer_sum + self.classifier_proj_size = classifier_proj_size + + if ( + (len(self.conv_stride) != self.num_feat_extract_layers) + or (len(self.conv_kernel) != self.num_feat_extract_layers) + or (len(self.conv_dim) != self.num_feat_extract_layers) + ): + raise ValueError( + "Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` ==" + " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) =" + f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`," + f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." + ) + + # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779 + self.apply_spec_augment = apply_spec_augment + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + + # parameters for pretraining with codevector quantized representations + self.num_codevectors_per_group = num_codevectors_per_group + self.num_codevector_groups = num_codevector_groups + self.contrastive_logits_temperature = contrastive_logits_temperature + self.num_negatives = num_negatives + self.codevector_dim = codevector_dim + self.proj_codevector_dim = proj_codevector_dim + self.diversity_loss_weight = diversity_loss_weight + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # adapter + self.add_adapter = add_adapter + self.adapter_kernel_size = adapter_kernel_size + self.adapter_stride = adapter_stride + self.num_adapter_layers = num_adapter_layers + self.output_hidden_size = output_hidden_size or hidden_size + + # SequenceClassification-specific parameter. Feel free to ignore for other classes. + self.classifier_proj_size = classifier_proj_size + + # XVector-specific parameters. Feel free to ignore for other classes. + self.tdnn_dim = list(tdnn_dim) + self.tdnn_kernel = list(tdnn_kernel) + self.tdnn_dilation = list(tdnn_dilation) + self.xvector_output_dim = xvector_output_dim + + @property + def inputs_to_logits_ratio(self): + return functools.reduce(operator.mul, self.conv_stride, 1) + + +__all__ = ["WavLMConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/english_normalizer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/english_normalizer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b71c5c63e23670419c1b0c97f8bb0a9213d84eab Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/english_normalizer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/feature_extraction_whisper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/feature_extraction_whisper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8fc628177ebbd5b34d85e1c85ec0824a69f30d77 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/feature_extraction_whisper.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/generation_whisper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/generation_whisper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3fc4757392daf84222bbc29dbc6c9da0e0249a87 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/generation_whisper.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/modeling_whisper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/modeling_whisper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c1c3fa29edc0a982f638b9c9459df11cac82fef Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/modeling_whisper.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/tokenization_whisper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/tokenization_whisper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79493a1fced057403a6d50f08cf614f2df4228b2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/tokenization_whisper.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1c3a3aa68efdaa624dde59fb3fae6b3d479bcc0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/configuration_x_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/configuration_x_clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a782794fc81127a57dae5cabf60ad0c55abcc61 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/configuration_x_clip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/modeling_x_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/modeling_x_clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8946c31fdb0e6b7d8189bd139a5496d373daaa5e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/modeling_x_clip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/processing_x_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/processing_x_clip.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dffe7a4b70dcba7e15e5782e74990de68936a36c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/processing_x_clip.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d668f03813127910b42fb3bf2f8e8c7375381f0d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/configuration_xglm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/configuration_xglm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5793f62222eca9fb99ed98650e17ee4946a1c196 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/configuration_xglm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_flax_xglm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_flax_xglm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ca7203d5f09064fc7e02c27fe4e22ba26f557cc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_flax_xglm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_tf_xglm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_tf_xglm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23a184e84fdd9f7cda75026b5ce4d0d2a174c2b2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_tf_xglm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_xglm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_xglm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bfdb86334885bd1d78908cbeead4e870b3f76a7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_xglm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..939bf84ffce488ad72f1117ba107c76d097c9b3d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a47535ffbe8dff81b391c0319b164ff40ea1a142 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5359628fbcc9e60703dbc9a97528b51a60ab07c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/configuration_xlnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/configuration_xlnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ccf9e62084a629193b011319490905ab5e41496 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/configuration_xlnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/modeling_tf_xlnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/modeling_tf_xlnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1127b7b3e54f9b92e0860b288ae7e1791302078d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/modeling_tf_xlnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/tokenization_xlnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/tokenization_xlnet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89d8cc26022cb976b50fc9f51bce43b01c4a0b2c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/tokenization_xlnet.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/tokenization_xlnet_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/tokenization_xlnet_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad2d74a14f252ff4469b9eae812462e642cdf0bf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/tokenization_xlnet_fast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c26dd9a5e2106e07559a5a19c3e46fdd42c20c0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/configuration_xlstm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/configuration_xlstm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2481db319e01b8e658d75a4a00f737183535875 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/configuration_xlstm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/modeling_xlstm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/modeling_xlstm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8db6ee7fed0a48fb0f12c2577d4d44b1366c177b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/modeling_xlstm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..48a233755de26862db8489d083ffc7f2809e821d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_zamba import * + from .modeling_zamba import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/configuration_zamba.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/configuration_zamba.py new file mode 100644 index 0000000000000000000000000000000000000000..3de9a19d1bd73e52b5f9819bbe67cc816bc10503 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/configuration_zamba.py @@ -0,0 +1,227 @@ +# coding=utf-8 +# Copyright 2024 Zyphra Technologies and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Zamba model configuration""" + +import math + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ZambaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ZambaModel`]. It is used to instantiate a + Zamba model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Zamba-v0.1 model. + + [Zyphra/Zamba-7B-v1](https://huggingface.co/Zyphra/Zamba-7B-v1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Zamba model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ZambaModel`] + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the + model has a output word embedding layer. + hidden_size (`int`, *optional*, defaults to 3712): + Dimension of the hidden representations. + attention_hidden_size (`int`, *optional*): + Dimension of the hidden representations of the inputs to the Attention layer. + intermediate_size (`int`, *optional*, defaults to 14848): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 76): + Number of hidden layers in the model. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + attention_head_dim (`int`, *optional*): + Dimension of the attention head in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 16): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=None`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). + n_mamba_heads (`int`, *optional*, defaults to 2): + Number of mamba heads for each mamba layer. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the decoder. + hidden_mamba_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the mamba layer. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + num_logits_to_keep (`int` or `None`, *optional*, defaults to 1): + Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an + integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the + logits of the last prompt token are needed for generation. For long sequences, the logits for the entire + sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint + significantly. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + max_position_embeddings (`int`, *optional*, defaults to 4096): + This value doesn't have any real effect. The maximum sequence length that this model is intended to be + used with. It can be used with longer sequences, but performance may degrade. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + attn_layer_period (`int`, *optional*, defaults to 6): + Once in this many layers, we will have a shared attention layer + attn_layer_offset (`int`, *optional*, defaults to 4): + Offset of the shared attention layer + use_mamba_kernels (`bool`, *optional*, defaults to `True`): + Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and + `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. Raises ValueError if + `True` and kernels are not available + mamba_d_state (`int`, *optional*, defaults to 16): + The dimension the mamba state space latents + mamba_d_conv (`int`, *optional*, defaults to 4): + The size of the mamba convolution kernel + mamba_expand (`int`, *optional*, defaults to 2): + Expanding factor (relative to hidden_size) used to determine the mamba intermediate size + mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`): + Rank of the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)` + time_step_min (`float`, *optional*, defaults to 0.001): + Minimum `time_step` used to bound `dt_proj_bias`. + time_step_max (`float`, *optional*, defaults to 0.1): + Maximum `time_step` used to bound `dt_proj_bias`. + time_step_floor (`float`, *optional*, defaults to 0.0001): + Minimum clamping value of the `dt_proj.bias` layer initialization. + mamba_conv_bias (`bool`, *optional*, defaults to `True`): + Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block. + mamba_proj_bias (`bool`, *optional*, defaults to `False`): + Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block + + """ + + model_type = "zamba" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + tie_word_embeddings=True, + hidden_size=3712, + attention_hidden_size=None, + intermediate_size=14848, + num_hidden_layers=76, + num_attention_heads=16, + attention_head_dim=None, + num_key_value_heads=16, + n_mamba_heads=2, + hidden_act="gelu", + hidden_mamba_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + num_logits_to_keep=1, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + max_position_embeddings=4096, + attention_dropout=0.0, + attn_layer_period=6, + attn_layer_offset=4, + use_mamba_kernels=True, + mamba_d_state=16, + mamba_d_conv=4, + mamba_expand=2, + mamba_dt_rank="auto", + time_step_min=0.001, + time_step_max=0.1, + time_step_floor=1e-4, + mamba_conv_bias=True, + mamba_proj_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.tie_word_embeddings = tie_word_embeddings + self.hidden_size = hidden_size + if attention_hidden_size is None: + self.attention_hidden_size = 2 * hidden_size + else: + self.attention_hidden_size = attention_hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + if attention_head_dim is None: + self.attention_head_dim = 2 * self.hidden_size // self.num_attention_heads + else: + self.attention_head_dim = attention_head_dim + self.max_position_embeddings = max_position_embeddings + self.attention_dropout = attention_dropout + + self.num_key_value_heads = num_key_value_heads + self.n_mamba_heads = n_mamba_heads + self.hidden_act = hidden_act + self.hidden_mamba_act = hidden_mamba_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + + self.use_cache = use_cache + self.num_logits_to_keep = num_logits_to_keep + + self.attn_layer_period = attn_layer_period + self.attn_layer_offset = attn_layer_offset + + self.use_mamba_kernels = use_mamba_kernels + self.mamba_d_state = mamba_d_state + self.mamba_d_conv = mamba_d_conv + self.mamba_expand = mamba_expand + self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank + self.time_step_min = time_step_min + self.time_step_max = time_step_max + self.time_step_floor = time_step_floor + self.mamba_conv_bias = mamba_conv_bias + self.mamba_proj_bias = mamba_proj_bias + + self.layers_block_type = self._layers_block_type(num_hidden_layers, attn_layer_period, attn_layer_offset) + + assert (self.mamba_expand * self.hidden_size) % self.n_mamba_heads == 0, ( + "`intermediate_size` should be divisible by `n_mamba_heads`." + ) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _layers_block_type(self, num_hidden_layers, attn_layer_period, attn_layer_offset): + layers = [ + "mamba", + "mamba", + "hybrid", + ] + ["hybrid" if i % attn_layer_period == attn_layer_offset else "mamba" for i in range(num_hidden_layers - 3)] + return layers + + +__all__ = ["ZambaConfig"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/modeling_zamba.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/modeling_zamba.py new file mode 100644 index 0000000000000000000000000000000000000000..dc95e1e550faa51a0c9a973dfc660d405f85fd28 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/modeling_zamba.py @@ -0,0 +1,1316 @@ +# coding=utf-8 +# Copyright 2024 Zyphra Technologies and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Zamba model.""" + +import math +from typing import Any, Callable, Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import auto_docstring, logging +from ...utils.deprecation import deprecate_kwarg +from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available +from .configuration_zamba import ZambaConfig + + +if is_mamba_ssm_available(): + from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn + from mamba_ssm.ops.triton.selective_state_update import selective_state_update +else: + selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None + +if is_causal_conv1d_available(): + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +else: + causal_conv1d_update, causal_conv1d_fn = None, None + +is_fast_path_available = all( + (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn) +) + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Zamba +class ZambaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + ZambaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class ZambaHybridDynamicCache: + """ + A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache + (which has a constant shape regardless of seq_len). + + This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states` + and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor + For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`, + while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors). + For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors), + while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`, + and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`. + """ + + is_compileable = False + + def __init__(self, config, batch_size, dtype=torch.float16, device=None): + self.dtype = dtype + self.is_compileable = False + self.layers_block_type = config.layers_block_type + self.has_previous_state = False # only used by mamba + self.intermediate_size = config.mamba_expand * config.hidden_size + self.ssm_state_size = config.mamba_d_state + self.conv_kernel_size = config.mamba_d_conv + self.n_mamba_heads = config.n_mamba_heads + self.conv_states = [] + self.ssm_states = [] + self.transformer_layers = [] + self._modules = {} + self._parameters = {} + self._buffers = {} + for i in range(config.num_hidden_layers): + self.conv_states += [ + torch.zeros(batch_size, self.intermediate_size, self.conv_kernel_size, device=device, dtype=dtype) + ] + cache_shape = ( + batch_size, + self.n_mamba_heads, + self.intermediate_size // self.n_mamba_heads, + self.ssm_state_size, + ) + self.ssm_states += [torch.zeros(cache_shape, device=device, dtype=dtype)] + if self.layers_block_type[i] == "hybrid": + self.transformer_layers.append(i) + + self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + + def __len__(self): + return len(self.key_cache) + + def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]: + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + # Copied from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache.update + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[dict[str, Any]] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + # Update the cache + if self.key_cache[layer_idx].shape[-1] == 0: + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + # Copied from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache.reorder_cache + def reorder_cache(self, beam_idx: torch.LongTensor): + """Reorders the cache for beam search, given the selected beam indices.""" + for layer_idx in range(len(self.key_cache)): + device = self.key_cache[layer_idx].device + self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) + device = self.value_cache[layer_idx].device + self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) + + device = self.conv_states[layer_idx].device + self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device)) + device = self.ssm_states[layer_idx].device + self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device)) + + # Copied from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache.get_seq_length + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + # take any layer that contains cache and not empty tensor + layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx + if len(self.key_cache) <= layer_idx: + return 0 + return self.key_cache[layer_idx].shape[-2] + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class ZambaAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + + Adapted from transformers.models.mistral.modeling_mistral.MistralAttention: + The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads. + The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer + (see fig. 2 in https://huggingface.co/papers/2405.16712). + Additionally, replaced + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2) + """ + + def __init__(self, config: ZambaConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + + self.attention_hidden_size = config.attention_hidden_size + self.head_dim = config.attention_head_dim + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.scaling = (self.head_dim / 2) ** -0.5 + self.is_causal = True + self.attention_dropout = config.attention_dropout + + self.q_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(config.attention_hidden_size, config.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(config.attention_hidden_size, config.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + layer_idx: int, + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[ZambaHybridDynamicCache] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + if past_key_values is not None: + key_states, value_states = past_key_values.update(key_states, value_states, layer_idx) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class ZambaMambaMixer(nn.Module): + """ + Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. + A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) + ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4, + and is why Mamba is called **selective** state spaces) + + This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways: + - Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head + undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of + `self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`. + """ + + def __init__(self, config: ZambaConfig, layer_idx): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.ssm_state_size = config.mamba_d_state + self.conv_kernel_size = config.mamba_d_conv + self.intermediate_size = config.mamba_expand * config.hidden_size + self.time_step_rank = config.mamba_dt_rank + self.n_mamba_heads = config.n_mamba_heads + self.mamba_head_dim = self.intermediate_size // self.n_mamba_heads + self.use_conv_bias = config.mamba_conv_bias + self.use_bias = config.mamba_proj_bias + self.conv1d = nn.Conv1d( + in_channels=self.intermediate_size, + out_channels=self.intermediate_size, + bias=self.use_conv_bias, + kernel_size=self.conv_kernel_size, + groups=self.intermediate_size, + padding=self.conv_kernel_size - 1, + ) + + self.activation = config.hidden_mamba_act + self.act = ACT2FN[config.hidden_mamba_act] + + self.use_fast_kernels = config.use_mamba_kernels + + # projection of the input hidden states + self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=self.use_bias) + # weight associated to the selective projection used to make dt, B and C input dependent + # each mamba head is processed independently + self.x_proj_weight = nn.Parameter( + torch.zeros( + self.n_mamba_heads, + self.time_step_rank + self.ssm_state_size * 2, + self.mamba_head_dim, + ) + ) + # time step projection (discretization) + self.dt_proj_weight = nn.Parameter( + (torch.zeros(self.n_mamba_heads, self.mamba_head_dim, self.time_step_rank) - 0.5) + * 2 + / self.time_step_rank**0.5 + ) + self.dt_proj_bias = nn.Parameter(torch.zeros(self.n_mamba_heads, self.mamba_head_dim)) + + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :] + A = A.expand(self.intermediate_size, -1).contiguous() + self.A_log = nn.Parameter(torch.log(A).reshape(self.n_mamba_heads, self.mamba_head_dim, -1)) + self.D = nn.Parameter(torch.ones(self.n_mamba_heads, self.mamba_head_dim)) + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias) + + if not is_fast_path_available: + logger.warning_once( + "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`" + " is None. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config" + ) + + def cuda_kernels_forward( + self, hidden_states: torch.Tensor, cache_params: ZambaHybridDynamicCache = None, attention_mask=None + ): + batch_size, seq_len, _ = hidden_states.shape + use_precomputed_states = cache_params is not None and cache_params.has_previous_state and seq_len == 1 + + # 1. Gated linear projection + projected_states = self.in_proj(hidden_states).transpose(1, 2) + + hidden_states, gate = projected_states.view(batch_size, -1, 2, seq_len).chunk(2, dim=2) + hidden_states = hidden_states.squeeze(2).contiguous() + gate = gate.squeeze(2) + gate = gate.reshape(batch_size, self.n_mamba_heads, -1, seq_len).transpose(0, 1) + + # 2. Convolution sequence transformation + conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)) + if use_precomputed_states: + hidden_states = causal_conv1d_update( + hidden_states.squeeze(-1), + cache_params.conv_states[self.layer_idx], + conv_weights, + self.conv1d.bias, + self.activation, + ) + hidden_states = hidden_states.unsqueeze(-1) + else: + if attention_mask is not None and not torch.all(attention_mask == 1): + hidden_states = hidden_states * attention_mask.unsqueeze(1) + if cache_params is not None: + conv_states = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)) + cache_params.conv_states[self.layer_idx].copy_(conv_states) + hidden_states = causal_conv1d_fn(hidden_states, conv_weights, self.conv1d.bias, activation=self.activation) + if attention_mask is not None and not torch.all(attention_mask == 1): + hidden_states = hidden_states * attention_mask.unsqueeze(1) + + # 3. SSM sequence transformation + # 3.a. input varying initialization of time_step, B and C + + hidden_states = hidden_states.reshape(-1, self.n_mamba_heads, self.mamba_head_dim, seq_len).transpose(0, 1) + ssm_parameters = (self.x_proj_weight[:, None, :, :] @ hidden_states).transpose(-1, -2) + + time_step, B, C = torch.split( + ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1 + ) + + discrete_time_step = self.dt_proj_weight[:, None] @ time_step.transpose(-1, -2) + + A = -torch.exp(self.A_log.float()) + + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + time_proj_bias = self.dt_proj_bias.float() if self.dt_proj_bias is not None else None + scan_outputs = torch.empty((batch_size, 0, seq_len), device=hidden_states.device, dtype=hidden_states.dtype) + + if use_precomputed_states: + for n in range(self.n_mamba_heads): + scan_outputs_ = selective_state_update( + cache_params.ssm_states[self.layer_idx][:, n], + hidden_states[n, ..., 0], + discrete_time_step[n, ..., 0], + A[n], + B[n, :, 0], + C[n, :, 0], + self.D[n], + gate[n, ..., 0], + time_proj_bias[n], + dt_softplus=True, + ).unsqueeze(-1) + scan_outputs = torch.cat((scan_outputs, scan_outputs_), dim=1) + + else: + ssm_state = torch.empty( + (batch_size, 0, self.mamba_head_dim, self.ssm_state_size), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + for n in range(self.n_mamba_heads): + scan_outputs_, ssm_state_ = selective_scan_fn( + hidden_states[n], + discrete_time_step[n], + A[n], + B[n].transpose(1, 2), + C[n].transpose(1, 2), + self.D[n].float(), + gate[n], + time_proj_bias[n], + delta_softplus=True, + return_last_state=True, + ) + scan_outputs = torch.cat((scan_outputs, scan_outputs_), dim=1).contiguous() + ssm_state = torch.cat((ssm_state, ssm_state_.unsqueeze(1)), dim=1) + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_outputs.transpose(1, 2)) + return contextualized_states + + def slow_forward(self, input_states, cache_params: ZambaHybridDynamicCache = None, attention_mask=None): + batch_size, seq_len, _ = input_states.shape + dtype = input_states.dtype + # 1. Gated linear projection + projected_states = self.in_proj(input_states).transpose(1, 2) + + hidden_states, gate = projected_states.view(batch_size, -1, 2, seq_len).chunk(2, dim=2) + hidden_states = hidden_states.squeeze(2).contiguous() + gate = gate.squeeze(2) + gate = gate.reshape(batch_size, self.n_mamba_heads, -1, seq_len).transpose(0, 1) + + use_cache = isinstance(cache_params, ZambaHybridDynamicCache) + # 2. Convolution sequence transformation + if use_cache and cache_params.ssm_states[self.layer_idx].shape[0] == batch_size: + if self.training: + # In training mode, we don't want to perform in-place operations on ssm_state so we can compute the backwards pass + ssm_state = cache_params.ssm_states[self.layer_idx].clone() + else: + ssm_state = cache_params.ssm_states[self.layer_idx] + + ssm_state = ssm_state.to(hidden_states.device) + + if ( + cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] == batch_size + ): + conv_state = cache_params.conv_states[self.layer_idx] + conv_state = torch.roll(conv_state, shifts=-1, dims=-1) + conv_state[:, :, -1] = hidden_states[:, :, 0] + cache_params.conv_states[self.layer_idx] = conv_state + hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1) + if self.use_conv_bias: + hidden_states += self.conv1d.bias + hidden_states = self.act(hidden_states).to(dtype).unsqueeze(-1) + else: + if attention_mask is not None and not torch.all(attention_mask == 1): + hidden_states = hidden_states * attention_mask[:, -hidden_states.shape[-1] :].unsqueeze(1) + conv_state = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)) + cache_params.conv_states[self.layer_idx] = conv_state + hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len]) + if attention_mask is not None and not torch.all(attention_mask == 1): + hidden_states = hidden_states * attention_mask[:, -hidden_states.shape[-1] :].unsqueeze(1) + else: + ssm_state = torch.zeros( + (batch_size, self.n_mamba_heads, self.mamba_head_dim, self.ssm_state_size), + device=hidden_states.device, + dtype=dtype, + ) + if attention_mask is not None and not torch.all(attention_mask == 1): + hidden_states = hidden_states * attention_mask.unsqueeze(1) + hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len]) + if attention_mask is not None and not torch.all(attention_mask == 1): + hidden_states = hidden_states * attention_mask.unsqueeze(1) + + # 3. State Space Model sequence transformation + # 3.a. Selection: [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2] + hidden_states = hidden_states.reshape(-1, self.n_mamba_heads, self.mamba_head_dim, seq_len).transpose(0, 1) + ssm_parameters = (self.x_proj_weight[:, None, :, :] @ hidden_states).transpose(-1, -2) + + time_step, B, C = torch.split( + ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1 + ) + discrete_time_step = (self.dt_proj_weight[:, None] @ time_step.transpose(-1, -2)) + self.dt_proj_bias[ + :, None, :, None + ] + + discrete_time_step = nn.functional.softplus(discrete_time_step) + + # 3.b. Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM) + A = -torch.exp(self.A_log.float()) + discrete_A = torch.exp(A[:, None, :, None, :] * discrete_time_step[:, :, :, :, None]) + discrete_B = discrete_time_step[:, :, :, :, None] * B[:, :, None, :, :].float() + deltaB_u = discrete_B * hidden_states[:, :, :, :, None].float() + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + scan_outputs = [] + for i in range(seq_len): + ssm_state = discrete_A[:, :, :, i, :].transpose(0, 1) * ssm_state + deltaB_u[:, :, :, i, :].transpose(0, 1) + scan_output = torch.matmul(ssm_state.transpose(0, 1).to(dtype), C[:, :, i, :].unsqueeze(-1)) + scan_outputs.append(scan_output[:, :, :, 0]) + scan_output = torch.stack(scan_outputs, dim=-1) + scan_output = scan_output + (hidden_states * self.D[:, None, :, None]) + scan_output = scan_output * self.act(gate) + + if use_cache: + cache_params.ssm_states[self.layer_idx] = ssm_state + + # 4. Final linear projection + contextualized_states = self.out_proj( + scan_output.transpose(0, 1).reshape(batch_size, -1, seq_len).transpose(1, 2) + ) + return contextualized_states + + def forward(self, hidden_states, cache_params: ZambaHybridDynamicCache = None, attention_mask=None): + if self.use_fast_kernels: + if not is_fast_path_available or "cuda" not in self.x_proj_weight.device.type: + raise ValueError( + "Fast Mamba kernels are not available. Make sure to they are installed and that " + "the mamba module is on a CUDA device. lease run 'pip install causal-conv1d>=1.2.0' " + "and 'pip install mamba-ssm', or set use_mamba_kernels=False in the model's config." + ) + return self.cuda_kernels_forward(hidden_states, cache_params, attention_mask=attention_mask) + return self.slow_forward(hidden_states, cache_params, attention_mask=attention_mask) + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Zamba +class ZambaMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +class ZambaAttentionDecoderLayer(nn.Module): + def __init__(self, config: ZambaConfig, layer_idx: Optional[int] = None): + super().__init__() + self.self_attn = ZambaAttention(config, layer_idx) + + self.feed_forward = ZambaMLP(config) + self.input_layernorm = ZambaRMSNorm(config.attention_hidden_size, eps=config.rms_norm_eps) + self.pre_ff_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + original_hidden_states: torch.Tensor, + layer_idx: int, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[ZambaHybridDynamicCache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)` + original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`. + This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The + concatenated tensor is then used as input of the pre-attention RMSNorm + (see fig. 2 in https://huggingface.co/papers/2405.16712). + layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers. + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + """ + hidden_states = torch.concatenate([hidden_states, original_hidden_states], dim=-1) + hidden_states = self.input_layernorm(hidden_states) + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + layer_idx=layer_idx, + attention_mask=attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + # feed-forward (MLP) + hidden_states = self.pre_ff_layernorm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +class ZambaMambaDecoderLayer(nn.Module): + def __init__(self, config: ZambaConfig, layer_idx: int): + super().__init__() + self.mamba = ZambaMambaMixer(config=config, layer_idx=layer_idx) + self.input_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.layer_idx = layer_idx + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + original_hidden_states: Optional[torch.Tensor] = None, + layer_idx: Optional[int] = None, + attention_mask: Optional[torch.Tensor] = None, + causal_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[ZambaHybridDynamicCache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + transformer_hidden_states: Optional[torch.Tensor] = None, + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + """ + + residual = hidden_states + + # `transformer_hidden_states` is the output from shared transformer + linear layer (see fig. 2 in https://huggingface.co/papers/2405.16712). + # `transformer_hidden_states` is then added to the input to the mamba layer below (as described in eq. (6) of https://huggingface.co/papers/2405.16712). + hidden_states = ( + hidden_states + transformer_hidden_states if transformer_hidden_states is not None else hidden_states + ) + hidden_states = self.input_layernorm(hidden_states) + + hidden_states = self.mamba( + hidden_states=hidden_states, + cache_params=past_key_values, + attention_mask=attention_mask, + ) + + self_attn_weights = None + + # residual connection after mamba + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (past_key_values,) + + return outputs + + +class ZambaHybridLayer(nn.Module): + def __init__(self, shared_transf: ZambaAttentionDecoderLayer, linear: nn.Linear, mamba: ZambaMambaDecoderLayer): + super().__init__() + self.shared_transf = shared_transf + self.linear = linear + self.mamba_decoder = mamba + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + original_hidden_states: Optional[torch.Tensor] = None, + layer_idx: Optional[int] = None, + attention_mask: Optional[torch.Tensor] = None, + causal_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[ZambaHybridDynamicCache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with + hidden activations to form the input of the shared transformer layer. + layer_idx (`int`): layer number. + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + """ + + layer_outputs = self.shared_transf( + hidden_states, + original_hidden_states=original_hidden_states, + layer_idx=layer_idx, + attention_mask=causal_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + transformer_hidden_states = layer_outputs[0] + + if output_attentions: + self_attn_weights = layer_outputs[1] + + transformer_hidden_states = self.linear(transformer_hidden_states) + + layer_outputs = self.mamba_decoder( + hidden_states, + transformer_hidden_states=transformer_hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + if output_attentions: + layer_outputs = (layer_outputs[0], self_attn_weights) + layer_outputs[2:] + + return layer_outputs + + +@auto_docstring +class ZambaPreTrainedModel(PreTrainedModel): + config: ZambaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["ZambaAttentionDecoderLayer", "ZambaMambaDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn = False + _supports_sdpa = False + # Note: only supports ZambaHybridDynamicCache + _is_stateful = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, ZambaRMSNorm): + module.weight.data.fill_(1.0) + elif isinstance(module, ZambaMambaMixer): + module.x_proj_weight.data.normal_(mean=0.0, std=std) + dt_init_std = self.config.mamba_dt_rank**-0.5 + nn.init.uniform_(module.dt_proj_weight, -dt_init_std, dt_init_std) + + mamba_head_dim = self.config.mamba_expand * self.config.hidden_size // self.config.n_mamba_heads + dt = torch.exp( + torch.rand(self.config.n_mamba_heads, mamba_head_dim) + * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + + math.log(self.config.time_step_min) + ).clamp(min=self.config.time_step_floor) + # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + torch.log(-torch.expm1(-dt)) + module.dt_proj_bias.data.copy_(inv_dt) + + A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :] + A = A.expand(module.intermediate_size, -1).contiguous() + module.A_log.data.copy_(torch.log(A).reshape(module.n_mamba_heads, module.mamba_head_dim, -1)) + module.D.data.fill_(1.0) + + +@auto_docstring +class ZambaModel(ZambaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`] + + Args: + config: ZambaConfig + """ + + def __init__(self, config: ZambaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + block = ZambaAttentionDecoderLayer(config) + mamba_layers = [] + linear_layers = [] + self.layers_block_type = config.layers_block_type + for i in range(config.num_hidden_layers): + if config.layers_block_type[i] == "mamba": + mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i)) + elif config.layers_block_type[i] == "hybrid": + linear_layers.append(nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False)) + mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i)) + mamba_layers = iter(mamba_layers) + linear_layers = iter(linear_layers) + layers = [] + self._tied_weights_keys = [] + for layer_id, layer_type in enumerate(self.layers_block_type): + if layer_type == "hybrid": + prefix_name = f"layers.{layer_id}." + tied_keys = [ + "shared_transf.self_attn.q_proj.weight", + "shared_transf.self_attn.k_proj.weight", + "shared_transf.self_attn.v_proj.weight", + "shared_transf.self_attn.o_proj.weight", + "shared_transf.feed_forward.gate_proj.weight", + "shared_transf.feed_forward.up_proj.weight", + "shared_transf.feed_forward.down_proj.weight", + "shared_transf.input_layernorm.weight", + "shared_transf.pre_ff_layernorm.weight", + ] + self._tied_weights_keys = [*self._tied_weights_keys, *[prefix_name + key for key in tied_keys]] + layers.append(ZambaHybridLayer(block, next(linear_layers), next(mamba_layers))) + else: + layers.append(next(mamba_layers)) + self.layers = nn.ModuleList(layers) + + self._attn_implementation = config._attn_implementation + self.final_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[ZambaHybridDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + hidden_states = inputs_embeds + + original_hidden_states = torch.clone(inputs_embeds) + # original_hidden_states: word embedding output that will be concatenated with hidden activations to form the input of the shared transformer layer + + if use_cache and past_key_values is None: + logger.warning_once( + "Zamba requires an initialized `ZambaHybridDynamicCache` to return a cache. None was " + "provided, so no cache will be returned." + ) + + if cache_position is None: + cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for layer_idx, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + original_hidden_states, + layer_idx, + attention_mask, + causal_mask, + past_key_values, + output_attentions, + use_cache, + cache_position, + ) + else: + layer_outputs = layer( + hidden_states, + original_hidden_states=original_hidden_states, + layer_idx=layer_idx, + attention_mask=attention_mask, + causal_mask=causal_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + if layer_outputs[1] is not None: + # append attentions only of attention layers. Mamba layers return `None` as the attention weights + all_self_attns += (layer_outputs[1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if past_key_values and not past_key_values.has_previous_state: + past_key_values.has_previous_state = True + + output = BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values if use_cache else None, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + return output if return_dict else output.to_tuple() + + # Copied from transformers.models.jamba.modeling_jamba.JambaModel._update_causal_mask + def _update_causal_mask(self, attention_mask, input_tensor, cache_position): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + target_length = cache_position[-1] + 1 + + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +# Adapted from transformers.models.jamba.modeling_jamba.JambaForCausalLM with Jamba->Zamba, JAMBA->ZAMBA +class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin): + def __init__(self, config: ZambaConfig): + super().__init__(config) + self.model = ZambaModel(config) + self._tied_weights_keys = ["lm_head.weight", *self.model._tied_weights_keys] + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[ZambaHybridDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> Union[tuple, CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, ZambaForCausalLM + + >>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1") + >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + # Overwritten -- has a unique cache type, `ZambaHybridDynamicCache` + + empty_past_kv = past_key_values is None + + # Omit tokens covered by past_key_values + if not empty_past_kv: + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. + # (we can't check exception 3 while compiling) + if ( + inputs_embeds is not None # Exception 1 + or cache_position[-1] >= input_ids.shape[1] # Exception 3 + ): + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + else: + past_key_values = ZambaHybridDynamicCache( + self.config, input_ids.shape[0], dtype=self.dtype, device=self.device + ) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if not empty_past_kv: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and empty_past_kv: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "logits_to_keep": self.config.num_logits_to_keep, + "cache_position": cache_position, + } + ) + + # Forward ALL kwargs that are uninitialized (e.g. `use_cache`). + for key, value in kwargs.items(): + if key not in model_inputs: + model_inputs[key] = value + + return model_inputs + + +@auto_docstring( + custom_intro=""" + The Zamba Model with a sequence classification head on top (linear layer). + + [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """ +) +class ZambaForSequenceClassification(ZambaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = ZambaModel(config) + self._tied_weights_keys = self.model._tied_weights_keys + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + last_non_pad_token = -1 + elif input_ids is not None: + # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id + non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32) + token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32) + last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) + else: + last_non_pad_token = -1 + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = ["ZambaForCausalLM", "ZambaForSequenceClassification", "ZambaModel", "ZambaPreTrainedModel"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..755f06de5f2b50c5a8574cb70a482b65f217aa56 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/configuration_zamba2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/configuration_zamba2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54f645ae0ffa0675f417d86a1610183e92aa5171 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/configuration_zamba2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/modeling_zamba2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/modeling_zamba2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0b9b895f0ea0d0dda6751db50520079f0527ba8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/modeling_zamba2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/modular_zamba2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/modular_zamba2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..471ac1df0fe011d5119fa462078ba162efb8c42d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/modular_zamba2.cpython-312.pyc differ